## Segmenting and Clustering Neighborhoods in Toronto

In [127]:
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

#### Load the DataFrame of Toronto neighborhoods with geographic coordinates from the previous step

In [128]:
# Read the neighborhood table from CSV; the first CSV column is used as the row index.
neighborhoods = pd.read_csv('neighborhoods_with_geo.csv', index_col=0)
# Preview the first rows.
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Guildwood, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Check the number of unique boroughs and neighborhoods

In [129]:
# Distinct boroughs vs. total number of rows (one row per postal code area)
borough_count = len(neighborhoods['Borough'].unique())
row_count = neighborhoods.shape[0]
print(f"The dataframe has {borough_count} boroughs and {row_count} neighborhoods.")

The dataframe has 11 boroughs and 103 neighborhoods.


#### Function to get latitude and longitude of any address

In [130]:
def get_lat_long(address):
    """
    Get the latitude and longitude of an address using the Nominatim geocoder.

    Parameters:
    address (string): address to get latitude and longitude for

    Returns:
    float, float: latitude and longitude

    Raises:
    ValueError: if the geocoder returns no match for the address
    """

    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.latitude`.
    if location is None:
        raise ValueError(f"Could not geocode address: {address!r}")
    return location.latitude, location.longitude

## Create a map of Toronto with neighborhoods superimposed on top

In [131]:
# Center the map on the geocoded location of Toronto
latitude, longitude = get_lat_long('Toronto, Ontario')
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of `neighborhoods`, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    # parse_html belongs to the Popup (escapes the label text); it is not a
    # CircleMarker option, so it is no longer passed to the marker below.
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

#### Define Foursquare Credentials and Version

In [132]:
# Foursquare credentials are read from the environment so they are never stored in the notebook.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here rather than failing later inside an API call.
# NOTE(review): credentials that were previously hard-coded in this cell should be treated as leaked and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180604' # Foursquare API version

#### Get the top `limit` venues within `radius` meters of each neighborhood

In [133]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """
    Query the Foursquare "explore" endpoint once per location and collect the venues.

    Parameters:
    names (iterable): label stored in the 'Neighborhood' column for each location
    latitudes (iterable): latitude of each location
    longitudes (iterable): longitude of each location
    radius (int): search radius passed to the API
    limit (int): maximum number of venues requested per location

    Returns:
    pandas.DataFrame: one row per venue with the columns 'Neighborhood',
    'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
    'Venue Longitude' and 'Venue Category'. The frame has these columns even
    when no venue was found.

    Raises:
    requests.HTTPError: if the API answers with an error status
    requests.Timeout: if a request takes longer than the timeout
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; fail fast on HTTP errors and never hang forever
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue without any category gets None instead of raising IndexError
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing `columns` here keeps the schema intact even when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

#### Get all nearby venues for each neighborhood in Toronto

In [134]:
# Fetch venues for every row of `neighborhoods`, overriding the function
# defaults (radius=500, limit=100). This performs one HTTP request per row.
_neighborhoods_venues = getNearbyVenues(names=neighborhoods['Neighbourhood'],
                                        latitudes=neighborhoods['Latitude'],
                                        longitudes=neighborhoods['Longitude'],
                                        radius=600, limit=150)

In [135]:
# Preview the first rows of the fetched venues (one row per venue).
_neighborhoods_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
3,"Malvern, Rouge",43.806686,-79.194353,Lee Valley,43.803161,-79.199681,Hobby Shop
4,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar


#### Check the number of unique venue categories

In [136]:
# Work on a copy so the raw API result in `_neighborhoods_venues` stays untouched
neighborhoods_venues = _neighborhoods_venues.copy()
category_count = len(neighborhoods_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 293 uniques categories.


#### One hot encoding, move column __Neighborhood__ to the first column

In [137]:
# One indicator column per venue category, named after the category itself
category_dummies = pd.get_dummies(neighborhoods_venues['Venue Category'])

# The fetched data contains a venue category that is literally called
# 'Neighborhood'. Its indicator column would clash with the label column added
# below, so it is dropped explicitly here. errors='ignore' keeps the cell
# working when that category is absent from the data.
# NOTE(review): rename the column instead of dropping it if that category
# should take part in the clustering.
neighborhoods_onehot = category_dummies.drop(columns='Neighborhood', errors='ignore')

# Put the neighborhood name in the first column (aligned on the row index)
neighborhoods_onehot.insert(0, 'Neighborhood', neighborhoods_venues['Neighborhood'])
neighborhoods_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Port Union, Rouge Hill, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# (rows, columns): one row per venue; 'Neighborhood' plus the category indicator columns.
neighborhoods_onehot.shape

(2843, 293)

#### Group rows by neighborhood, taking the mean frequency of occurrence of each category

In [139]:
# Averaging the 0/1 indicators per neighborhood gives each category's share of that neighborhood's venues
per_neighborhood = neighborhoods_onehot.groupby('Neighborhood')
neighborhoods_grouped = per_neighborhood.mean().reset_index()
neighborhoods_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Albion Gardens, Silverstone, South Steeles, Hu...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011111,...,0.0,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
# (rows, columns): one row per distinct neighborhood name that has at least one venue.
neighborhoods_grouped.shape

(101, 293)

#### Sort the venues in descending order

In [141]:
def return_most_common_venues(row, num_top_venues):
    """
    Return the labels of the largest values in a row, ignoring its first entry.

    Parameters:
    row (pandas.Series): first entry is skipped (the neighborhood name in this
        notebook); the remaining entries are sorted by value
    num_top_venues (int): number of labels to return

    Returns:
    numpy.ndarray: up to `num_top_venues` index labels, largest value first
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

#### Create the new dataframe and display the top 10 venues for each neighborhood

In [142]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column labels: '1st', '2nd', '3rd', then '<n>th' for every later rank.
# An explicit bounds check replaces the former bare `except:`, which would
# also have hidden unrelated errors.
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Rank the categories of every neighborhood, then build the frame in one step
# instead of filling it row by row.
top_venues = [
    return_most_common_venues(neighborhoods_grouped.iloc[pos, :], num_top_venues)
    for pos in range(neighborhoods_grouped.shape[0])
]
_neighborhoods_venues_sorted = pd.DataFrame(top_venues,
                                            columns=columns[1:],
                                            index=neighborhoods_grouped.index)
_neighborhoods_venues_sorted.insert(0, 'Neighborhood', neighborhoods_grouped['Neighborhood'])

_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Skating Rink,Lounge,Breakfast Spot,Clothing Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,"Albion Gardens, Silverstone, South Steeles, Hu...",Grocery Store,Fast Food Restaurant,Pizza Place,Beer Store,Sandwich Place,Fried Chicken Joint,Hardware Store,Pharmacy,Ethiopian Restaurant,Empanada Restaurant
2,"Alderwood, Long Branch",Pizza Place,Gym,Skating Rink,Dance Studio,Athletics & Sports,Gas Station,Convenience Store,Sandwich Place,Pub,Coffee Shop
3,Bayview Village,Café,Chinese Restaurant,Bank,Japanese Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Diner
4,Berczy Park,Coffee Shop,Restaurant,Hotel,Café,Japanese Restaurant,Cocktail Bar,Seafood Restaurant,Creperie,Vegetarian / Vegan Restaurant,Steakhouse


#### Run k-means to cluster the neighborhoods into 3 clusters

In [143]:
# set number of clusters
kclusters = 3

neighborhoods_venues_sorted = _neighborhoods_venues_sorted.copy()
neighborhoods_grouped_clustering = neighborhoods_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neighborhoods_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int32)

#### Add clustering labels

In [144]:
# Drop a label column left over from an earlier run so this cell can be
# re-executed without "cannot insert Cluster Labels, already exists".
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# use the same spelling of the key column in both frames
neighborhoods = neighborhoods.rename(columns={"Neighbourhood": "Neighborhood"})

# attach the cluster label and top venues to each row of `neighborhoods`, which
# carries latitude/longitude; rows whose neighborhood has no entry in
# `neighborhoods_venues_sorted` get NaN in the added columns
neighborhoods_merged = neighborhoods.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

neighborhoods_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.0,Fast Food Restaurant,Spa,Hobby Shop,Drugstore,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,0.0,Moving Target,Bar,Yoga Studio,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,M1E,Scarborough,"West Hill, Guildwood, Morningside",43.763573,-79.188711,0.0,Pizza Place,Thrift / Vintage Store,Fried Chicken Joint,Intersection,Fast Food Restaurant,Electronics Store,Medical Center,Mexican Restaurant,Park,Pharmacy
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Business Service,Convenience Store,Korean Restaurant,Yoga Studio,Drugstore,Dive Bar,Dog Run,Doner Restaurant,Donut Shop
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Bakery,Indian Restaurant,Hakka Restaurant,Chinese Restaurant,Fried Chicken Joint,Caribbean Restaurant,Thai Restaurant,Athletics & Sports,Gas Station,Bank


#### Create map with colored neighborhoods

In [146]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one distinct hex color per cluster, derived from kclusters so the palette
# always has exactly as many entries as there are clusters
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# rows without a cluster label (NaN after the join) cannot be colored, so drop them
neighborhoods_merged = neighborhoods_merged.dropna()

# add markers to the map
for lat, lon, poi, cluster in zip(neighborhoods_merged['Latitude'], neighborhoods_merged['Longitude'], neighborhoods_merged['Neighborhood'], neighborhoods_merged['Cluster Labels']):
    # labels are floats after the join; convert once so the popup reads
    # "Cluster 0" and the label indexes the palette directly
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
<img src="/home/eclipser/Desktop/Screenshot at 2019-12-26 14-30-11.png" alt="Screenshot of the clustered neighborhood map" title="Clustered neighborhoods" />