### Capstone Project:
### Segmenting and Clustering Neighborhoods in Toronto

### Part 1 : Importing Toronto post codes from wikipedia into a pd Dataframe

In [1]:
# Librearies importation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import json # library to handle JSON files
import requests # library to handle requests
from pandas import json_normalize # tranform JSON file into a pandas dataframe
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# import k-means from clustering stage
from sklearn.cluster import KMeans


In [2]:
## Toronto postcode dataframe importation

# scraping Toronto postcode table from wikipedia
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
# Dropping rows with Borough 'Not assigned' values
df.drop(df[df['Borough']=='Not assigned'].index,axis=0,inplace=True)
# Replacing '/' by ',' in 'neighborhood' column
df['Neighborhood'] = df['Neighborhood'].apply(lambda x: ','.join(x.split('/')))
# reseting rows index
df = df.reset_index(drop = True)
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


### Assuptions
##### 1 : Only the cells that have an assigned borough have been processed, the others have been droped
##### 2 : If a cell had a borough but a Not assigned neighborhood, the neighborhood will be the same as the borough.

In [4]:
print(f"The shape of the resulting dataframe is {df.shape}")

The shape of the resulting dataframe is (103, 3)


### Part 2 : Adding latitude and logitude to the dataframe

In [3]:
df_geo = pd.read_csv('Geospatial_Coordinates.csv')
df_geo.columns = ['Postal code', 'Latitude', 'Longitude']
df = pd.merge(df,df_geo,on='Postal code')
df

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509


### Part 3 : Clustering

#### Foursquare credentials:

In [4]:
CLIENT_ID = 'ZC4IJRQBYU4FUZKXVZKHACJAHMUNQXEUVZ4LR4CSM1RCGC1O' # your Foursquare ID
CLIENT_SECRET = 'JJER2S2X5WPSXY2EEMFCVGHQUDBAR0N0DSDPCFNFC1II1RKU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

#### Keeping only borough with 'Toronto' string in it

In [5]:
df = df[df['Borough'].str.contains("Toronto")].reset_index(drop = True)
df.shape

(39, 5)

#### Use geopy library to get the latitude and longitude values of New York City.

In [6]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent='Toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

#### Create a map of New York with neighborhoods superimposed on top.

In [7]:
# create map of New York using latitude and longitude values
map_Toronto = folium.Map(location=[latitude,longitude],zoom_start=10)
# add markers to map

for lat,lgt,bor,nei in zip(df['Latitude'],
                           df['Longitude'],
                           df['Borough'],
                           df['Neighborhood']):
    
    label = f'{nei}, {bor}'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lgt],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)

map_Toronto

#### Let's explore the first neighborhood (postal code) in our dataframe.

In [182]:
df['Neighborhood'][0]

'Regent Park , Harbourfront'

In [8]:
Postalcode_latitude = df['Latitude'][0]
Postalcode_longitude = df['Longitude'][0]
Postalcode_nb = df['Postal code'][0]

#### Now, let's get the top 100 venues that are in 1st neighborhood within a radius of 500 meters.

In [9]:
latitude = Postalcode_latitude
longitude = Postalcode_longitude
Radius = 500
limit = 100

#### Crating a function to request venues from Foursquare

In [11]:
def request_venues(df,n):
    latitude = df['Latitude'][n]
    longitude = df['Longitude'][n]
    url = 'https://api.foursquare.com/v2/venues/explore'
    params = dict(client_id = CLIENT_ID,
              client_secret=CLIENT_SECRET,
              v=VERSION,
              ll=f'{latitude},{longitude}',
              limit=limit)

    resp = requests.get(url=url, params=params)
    data = json.loads(resp.text)
    data = data['response']['groups'][0]['items']
    return data

In [52]:
request_venues(df,0)

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '54ea41ad498e9a11e9e13308',
   'name': 'Roselle Desserts',
   'location': {'address': '362 King St E',
    'crossStreet': 'Trinity St',
    'lat': 43.653446723052674,
    'lng': -79.3620167174383,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.653446723052674,
      'lng': -79.3620167174383}],
    'distance': 143,
    'postalCode': 'M5A 1K9',
    'cc': 'CA',
    'city': 'Toronto',
    'state': 'ON',
    'country': 'Canada',
    'formattedAddress': ['362 King St E (Trinity St)',
     'Toronto ON M5A 1K9',
     'Canada']},
   'categories': [{'id': '4bf58dd8d48988d16a941735',
     'name': 'Bakery',
     'pluralName': 'Bakeries',
     'shortName': 'Bakery',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/bakery_',
      'suffix': '.png'},
     'primary': True}],
   'photos': {'count': 0, 'grou

#### Crating a function to get venues categories for one neighborhood (or postal code)

In [12]:
def Venue_category(data):
    liste_categories = []
    for n in range(0,len(data)):
        liste_categories.append(data[n]['venue']['categories'][0]['name'])
    return pd.DataFrame(liste_categories)

#### Using the 2 above functions to retrieve the venues for each neighborhood from Foursquare

In [76]:
data = pd.DataFrame()
index = []
for n in range(0,df.shape[0]):
    index.append(df['Postal code'][n])
    data[df['Postal code'][n]] = Venue_category(request_venues(df,n))[0]               
data

Unnamed: 0,M5A,M7A,M5B,M5C,M4E,M5E,M5G,M6G,M5H,M6H,...,M4T,M5T,M4V,M5V,M4W,M5W,M4X,M5X,M4Y,M7Y
0,Bakery,Park,Clothing Store,Coffee Shop,Trail,Liquor Store,Coffee Shop,Grocery Store,Concert Hall,Bar,...,Grocery Store,Arts & Crafts Store,Bagel Shop,Harbor / Marina,Grocery Store,Vegetarian / Vegan Restaurant,Diner,Gym / Fitness Center,Dance Studio,Brewery
1,Coffee Shop,Coffee Shop,Tea Room,Japanese Restaurant,Indie Movie Theater,Restaurant,Coffee Shop,Café,Plaza,Middle Eastern Restaurant,...,Bagel Shop,Organic Grocery,French Restaurant,Harbor / Marina,BBQ Joint,Museum,Italian Restaurant,Restaurant,Theme Restaurant,Farmers Market
2,Restaurant,Sushi Restaurant,Comic Shop,Restaurant,Gastropub,Vegetarian / Vegan Restaurant,Coffee Shop,Italian Restaurant,Vegetarian / Vegan Restaurant,Brewery,...,Liquor Store,Bakery,Supermarket,Garden,Juice Bar,Park,Japanese Restaurant,Gym,Mexican Restaurant,Comic Shop
3,Historic Site,Italian Restaurant,Theater,Creperie,Vegetarian / Vegan Restaurant,Concert Hall,Coffee Shop,Park,Speakeasy,Coffee Shop,...,Spa,Mexican Restaurant,Spa,Airport,Farmers Market,Restaurant,Indian Restaurant,Bakery,Bubble Tea Shop,Burrito Place
4,Farmers Market,Creperie,Plaza,Food Truck,Bakery,Museum,Middle Eastern Restaurant,Indian Restaurant,Restaurant,Coffee Shop,...,Coffee Shop,Café,Tapas Restaurant,Coffee Shop,Historic Site,Cocktail Bar,Restaurant,Coffee Shop,Gastropub,Brewery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Karaoke Bar,,Clothing Store,Japanese Restaurant,Gaming Cafe,Eastern European Restaurant,Mediterranean Restaurant,Sushi Restaurant,Hotel,Café,...,Thai Restaurant,Music Venue,Vegetarian / Vegan Restaurant,History Museum,Bank,Breakfast Spot,American Restaurant,Taco Place,Strip Club,Coffee Shop
96,Café,,Liquor Store,Italian Restaurant,Cocktail Bar,Yoga Studio,Café,Mexican Restaurant,Restaurant,Gym,...,French Restaurant,Bar,Paper / Office Supplies Store,New American Restaurant,Hotel,Beer Bar,Park,Cosmetics Shop,Japanese Restaurant,Dive Bar
97,Bar,,Breakfast Spot,Clothing Store,Indian Restaurant,Japanese Restaurant,Restaurant,Indian Restaurant,Restaurant,Brazilian Restaurant,...,Bakery,Café,Pub,Coffee Shop,Grocery Store,Movie Theater,Smoke Shop,American Restaurant,Food & Drink Shop,BBQ Joint
98,Curling Ice,,Fried Chicken Joint,Breakfast Spot,Sandwich Place,Art Gallery,Theater,Japanese Restaurant,Clothing Store,Diner,...,Fish Market,Arepa Restaurant,Frozen Yogurt Shop,Brewery,Bank,Shopping Mall,Pub,Restaurant,Sculpture Garden,Government Building


#### One hot encoding before clustering with K-means

In [188]:
data2 = 0
data2 = data.transpose()
data2 = pd.get_dummies(data2,prefix='',prefix_sep='')
#data2 = data2.reset_index()
#data2.columns = ['Postal code'] + list(data2.columns[1:])
data2 = data2.groupby(level = 0 , axis=1).sum()
print(data2.shape)
data2.head()


(39, 274)


Unnamed: 0,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Arepa Restaurant,Art Gallery,Art Museum,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo
M5A,0,0,0,0,1,0,0,0,2,0,...,0,0,0,0,0,0,0,0,1,0
M7A,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,2,0
M5B,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M5C,0,0,3,0,0,0,0,0,2,0,...,1,0,0,0,0,1,0,0,0,0
M4E,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


#### Clustering with K-Means

In [223]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(data2)
kmeans.labels_

array([0, 0, 0, 2, 1, 2, 0, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 3, 3, 3, 3,
       1, 3, 1, 1, 3, 1, 1, 3, 1, 3, 2, 3, 2, 1, 2, 0, 1])

#### Reshaping the data with 10 most common venue by neighborhood ans adding cluster label

In [235]:
col = []
for n in range(1,11):
    col.append(f'n°{n} must common venue')

data3 = []
for n in range(0,len(data2)):
            data3.append(list(data2.iloc[n,:].sort_values(ascending=False)[0:10].index))
data3 = pd.DataFrame(data=data3,index=data2.index,columns=col)
data3['label'] = kmeans.labels_
data3 = data3.reset_index()
data3.columns = ['Postal code'] + list(data3.columns[1:])
data3
data4 = pd.merge(data3,df,on='Postal code')
data4.head()

Unnamed: 0,Postal code,n°1 must common venue,n°2 must common venue,n°3 must common venue,n°4 must common venue,n°5 must common venue,n°6 must common venue,n°7 must common venue,n°8 must common venue,n°9 must common venue,n°10 must common venue,label,Borough,Neighborhood,Latitude,Longitude
0,M5A,Coffee Shop,Café,Restaurant,Theater,Bakery,Farmers Market,Park,Italian Restaurant,Pub,Breakfast Spot,0,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M7A,Coffee Shop,Sandwich Place,Sushi Restaurant,Italian Restaurant,Park,Gastropub,Japanese Restaurant,Café,Yoga Studio,Diner,0,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,M5B,Coffee Shop,Clothing Store,Restaurant,Tea Room,Bubble Tea Shop,Burger Joint,Japanese Restaurant,Café,Chinese Restaurant,Theater,0,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Coffee Shop,Café,Seafood Restaurant,Italian Restaurant,Bakery,Gastropub,American Restaurant,Cosmetics Shop,Hotel,Theater,2,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,Coffee Shop,Park,Beach,Pub,Breakfast Spot,Café,Bakery,Indian Restaurant,Ice Cream Shop,BBQ Joint,1,East Toronto,The Beaches,43.676357,-79.293031


#### Cluster vizualisation on Toronto Map

In [238]:
# create map of New York using latitude and longitude values
map_Toronto = folium.Map(location=[latitude,longitude],zoom_start=10)
# add markers to map

for lat,lgt,bor,nei,clus in zip(data4['Latitude'],
                           data4['Longitude'],
                           data4['Borough'],
                           data4['Neighborhood'],
                           data4['label']):
    
    label = f'{nei}, {bor}, Cluster {clus}'
    label = folium.Popup(label, parse_html=True)
    
    for n, color in enumerate(['blue','red','green','yellow']):
        if clus == n:
            folium.CircleMarker(
                [lat, lgt],
                radius=5,
                popup=label,
                color=color,
                fill=True,
                fill_color='#3186cc',
                fill_opacity=0.7,
                parse_html=False).add_to(map_Toronto)

map_Toronto

#### defining clusters 0

In [245]:
data4[data4['label']==0].iloc[:,:11].set_index('Postal code')

Unnamed: 0_level_0,n°1 must common venue,n°2 must common venue,n°3 must common venue,n°4 must common venue,n°5 must common venue,n°6 must common venue,n°7 must common venue,n°8 must common venue,n°9 must common venue,n°10 must common venue
Postal code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
M5A,Coffee Shop,Café,Restaurant,Theater,Bakery,Farmers Market,Park,Italian Restaurant,Pub,Breakfast Spot
M7A,Coffee Shop,Sandwich Place,Sushi Restaurant,Italian Restaurant,Park,Gastropub,Japanese Restaurant,Café,Yoga Studio,Diner
M5B,Coffee Shop,Clothing Store,Restaurant,Tea Room,Bubble Tea Shop,Burger Joint,Japanese Restaurant,Café,Chinese Restaurant,Theater
M5G,Coffee Shop,Café,Restaurant,Japanese Restaurant,Bar,Art Gallery,French Restaurant,Sushi Restaurant,Ramen Restaurant,Sandwich Place
M4Y,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Diner,Gastropub,Liquor Store,Bookstore,Ramen Restaurant


#### Defining cluster 0

In [261]:
num_cluster = 0
cluster = data4[data4['label']==num_cluster].iloc[:,:11].set_index('Postal code')

serie_venue= pd.Series()
for col in list(cluster.columns):
    serie_venue = serie_venue.append(cluster[col].value_counts())

serie_venue.reset_index().groupby('index').sum().sort_values(0,ascending=False)[0:10]


  after removing the cwd from sys.path.


Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
Coffee Shop,5
Restaurant,4
Café,4
Japanese Restaurant,4
Sushi Restaurant,3
Gastropub,2
Italian Restaurant,2
Theater,2
Sandwich Place,2
Ramen Restaurant,2


##### --> Cluster 0 : Coffee shop, restaurants (mostly asian)

#### Defining cluster 1

In [260]:
num_cluster = 1
cluster = data4[data4['label']==num_cluster].iloc[:,:11].set_index('Postal code')

serie_venue= pd.Series()
for col in list(cluster.columns):
    serie_venue = serie_venue.append(cluster[col].value_counts())

serie_venue.reset_index().groupby('index').sum().sort_values(0,ascending=False)[0:7]

  after removing the cwd from sys.path.


Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
Café,16
Bakery,15
Coffee Shop,15
Bar,13
Park,12
Italian Restaurant,10
Restaurant,7


##### --> Cluster 1 :  bakeries, bars

#### Defining cluster 2

In [262]:
num_cluster = 2
cluster = data4[data4['label']==num_cluster].iloc[:,:11].set_index('Postal code')

serie_venue= pd.Series()
for col in list(cluster.columns):
    serie_venue = serie_venue.append(cluster[col].value_counts())

serie_venue.reset_index().groupby('index').sum().sort_values(0,ascending=False)[0:7]

  after removing the cwd from sys.path.


Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
Coffee Shop,9
Hotel,9
Café,9
Restaurant,7
Japanese Restaurant,6
Gym,6
American Restaurant,5


##### --> Cluster 2 : hotels, restaurants

#### Defining cluster 3

In [263]:
num_cluster = 3
cluster = data4[data4['label']==num_cluster].iloc[:,:11].set_index('Postal code')

serie_venue= pd.Series()
for col in list(cluster.columns):
    serie_venue = serie_venue.append(cluster[col].value_counts())

serie_venue.reset_index().groupby('index').sum().sort_values(0,ascending=False)[0:7]

  after removing the cwd from sys.path.


Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
Italian Restaurant,9
Café,9
Park,8
Cluster,8
Coffee Shop,8
Sushi Restaurant,7
Bakery,6


##### --> Cluster 2 : Italian restauants, parks