## Load the data

Install and import relevant libraries

In [23]:
!conda install folium -c conda-forge
import os
from collections import deque

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tqdm import tqdm
print('done')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.8.0                      py_0    conda-forge
done


Import previously saved Toronto Geospatial CSV

In [25]:
# Read the previously saved postcode / borough / neighbourhood / coordinates table.
TData = pd.read_csv(filepath_or_buffer="torontoData.csv")
TData.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Let's explore the data we have;

In [26]:
# Distinct boroughs, and number of rows in the table (one row per postcode).
summary_template = 'Toronto has {} boroughs and {} neighborhoods.'
print(summary_template.format(TData['Borough'].unique().size, len(TData.index)))

Toronto has 11 boroughs and 103 neighborhoods.


Now we need Toronto's coordinates to build the map later;

In [27]:
address = 'Toronto, Canada'

# Nominatim's usage policy requires an identifying user agent; recent geopy
# releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print("Toronto's coordinates are {}, {}.".format(latitude, longitude))

Toronto's coordinates are 43.653963, -79.387207.


In [28]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

neighborhoods = TData

# Draw one blue circle marker per row, with a "<neighbourhood>, <borough>" popup.
marker_columns = ('Latitude', 'Longitude', 'Borough', 'Neighbourhood')
for lat, lng, borough, neighborhood in zip(*(neighborhoods[col] for col in marker_columns)):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

For the sake of this assignment, let's limit the analysis to one of the biggest boroughs in Toronto: Scarborough.

In [29]:
# Keep only the Scarborough rows and renumber them from 0.
is_scarborough = TData['Borough'] == 'Scarborough'
scarborough_data = TData.loc[is_scarborough].reset_index(drop=True)
scarborough_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Foursquare configuration

Next, let's configure Foursquare access;

In [30]:
# Foursquare credentials are read from the environment so real keys never get
# committed with the notebook; 'xxx' remains the placeholder when unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'xxx')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'xxx')
# Value sent as the `v` query parameter of every Foursquare request.
VERSION = '20190303'

Then define a function to get the top venues around each neighbourhood;

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000, LIMIT = 100):
    """Fetch venues around each location from the Foursquare ``venues/explore`` endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : sized iterables of equal length
        One label and one coordinate pair per location to query
        (pandas Series or plain lists both work).
    radius : int, optional
        Sent unchanged as the ``radius`` query parameter
        (presumably metres; confirm against the Foursquare docs).
    LIMIT : int, optional
        Sent unchanged as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when nothing is returned.
        'Venue Category' is None for a venue that carries no category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with a non-success status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total=len(names)):
        # Let requests build and escape the query string.
        response = requests.get(
            endpoint,
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # Surface bad credentials / quota errors here rather than as a KeyError below.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or [{}]
        for item in groups[0].get('items', []):
            venue = item['venue']
            # Some venues come back without any category; keep them, unlabeled.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names up front keeps this valid when no venue was found.
    return pd.DataFrame(venue_rows, columns=columns)

Now let's run the function on our chosen borough;

In [32]:
# Query Foursquare once per Scarborough row, using the default radius and limit.
Svenues = getNearbyVenues(
    names=scarborough_data['Neighbourhood'],
    latitudes=scarborough_data['Latitude'],
    longitudes=scarborough_data['Longitude'],
)

100%|██████████| 17/17 [00:06<00:00,  2.56it/s]


Check the size of the resulting dataframe and look at its first rows;

In [33]:
# (rows, columns) of the venue table, followed by its first five rows.
venue_table_shape = Svenues.shape
print(venue_table_shape)
Svenues.head(n=5)

(1112, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,African Rainforest Pavilion,43.817725,-79.183433,Zoo Exhibit
1,"Rouge,Malvern",43.806686,-79.194353,Toronto Pan Am Sports Centre,43.790623,-79.193869,Athletics & Sports
2,"Rouge,Malvern",43.806686,-79.194353,Toronto Zoo,43.820582,-79.181551,Zoo
3,"Rouge,Malvern",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
4,"Rouge,Malvern",43.806686,-79.194353,Polar Bear Exhibit,43.823372,-79.185145,Zoo


We can see how many unique venue categories there are;

In [34]:
# Number of distinct values in the 'Venue Category' column.
category_count = Svenues['Venue Category'].unique().size
print('There are {} uniques categories.'.format(category_count))

There are 153 uniques categories.


Let's use one hot encoding on the venue categories;

In [37]:
# One indicator column per venue category, one row per venue.
S_OHE = pd.get_dummies(Svenues["Venue Category"],
                       prefix="",
                       prefix_sep="")

# A venue category may itself be called "Neighborhood". Writing the
# neighbourhood names into a column of that name would silently overwrite the
# category's indicator column, so move the category aside first.
if "Neighborhood" in S_OHE.columns:
    S_OHE = S_OHE.rename(columns={"Neighborhood": "Neighborhood (Venue Category)"})

# Put the neighbourhood name in front while leaving the category columns in
# their original order.
S_OHE.insert(0, "Neighborhood", Svenues["Neighborhood"])

S_OHE.head()

Unnamed: 0,Neighborhood,Noodle House,Optical Shop,Other Great Outdoors,Paper / Office Supplies Store,Park,Pet Store,Pharmacy,Pizza Place,Playground,...,Malay Restaurant,Market,Medical Center,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Movie Theater,Music Store
0,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge,Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We group by neighborhood and take the mean of each one-hot column, i.e. the relative frequency of each venue category among that neighborhood's venues;

In [38]:
# Per-neighbourhood mean of every indicator column; 'Neighborhood' stays a regular column.
S_grouped = S_OHE.groupby('Neighborhood', as_index=False).mean()
S_grouped.head()

Unnamed: 0,Neighborhood,Noodle House,Optical Shop,Other Great Outdoors,Paper / Office Supplies Store,Park,Pet Store,Pharmacy,Pizza Place,Playground,...,Malay Restaurant,Market,Medical Center,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Movie Theater,Music Store
0,Agincourt,0.01,0.0,0.0,0.0,0.01,0.0,0.05,0.02,0.0,...,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0
1,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.021978,0.0,0.0,0.0,0.032967,0.0,0.032967,0.043956,0.0,...,0.010989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010989,0.0
2,"Birch Cliff,Cliffside West",0.0,0.0,0.0,0.0,0.046512,0.0,0.046512,0.046512,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cedarbrae,0.0,0.0,0.0,0.01,0.01,0.0,0.03,0.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01
4,"Clairlea,Golden Mile,Oakridge",0.0,0.010526,0.0,0.0,0.0,0.021053,0.010526,0.042105,0.0,...,0.0,0.0,0.0,0.0,0.010526,0.0,0.010526,0.0,0.010526,0.0


For clustering let us select the 10 most frequent venue categories;
first we define a function to get the N number of top venue categories

In [39]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped by position before ranking. The
    remaining values are ordered from largest to smallest and the index labels
    of the leading ``num_top_venues`` entries are returned as an array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

Then we run it;

In [40]:
num_top_venues = 10


def _ordinal(n):
    """Return ``n`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# column headers: 'Neighborhood' followed by one ranked column per top venue
columns = ['Neighborhood'] + ['{} Most Common Venue'.format(_ordinal(rank))
                              for rank in range(1, num_top_venues + 1)]

# rank the categories of every neighbourhood, then build the frame in one go
top_venues = [return_most_common_venues(S_grouped.iloc[pos, :], num_top_venues)
              for pos in range(S_grouped.shape[0])]
neighborhoods_venues_sorted = pd.DataFrame(top_venues,
                                           columns=columns[1:],
                                           index=S_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', S_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Coffee Shop,Pharmacy,Bakery,Shopping Mall,Cantonese Restaurant,Sandwich Place,Restaurant,Indian Restaurant,Breakfast Spot
1,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Chinese Restaurant,Coffee Shop,Pizza Place,Japanese Restaurant,Hong Kong Restaurant,Park,Pharmacy,Dessert Shop,Indian Restaurant,Vietnamese Restaurant
2,"Birch Cliff,Cliffside West",Coffee Shop,Grocery Store,Bank,Pharmacy,Pizza Place,Fast Food Restaurant,Park,Beer Store,Dog Run,Fish & Chips Shop
3,Cedarbrae,Coffee Shop,Clothing Store,Gym,Fast Food Restaurant,Sandwich Place,Restaurant,Indian Restaurant,Wings Joint,Pizza Place,Pharmacy
4,"Clairlea,Golden Mile,Oakridge",Coffee Shop,Fast Food Restaurant,Sandwich Place,Burger Joint,Pizza Place,Burrito Place,Chinese Restaurant,Grocery Store,Department Store,Hardware Store


## Clustering the neighborhoods

Before clustering, let us use a PCA decomposition to reduce the noise of the signal and improve clustering efficiency

In [41]:
# Feature matrix: the per-neighbourhood category frequencies only. The name
# column is dropped, and so is a 'Cluster Labels' column if a previous run of
# the notebook left one on S_grouped.
venue_frequencies = S_grouped.drop(['Neighborhood', 'Cluster Labels'],
                                   axis=1, errors='ignore')

# A fractional n_components keeps just enough principal components to explain
# that share (95%) of the variance.
pca = PCA(.95)

# Cluster on the PCA-reduced features; the result must not be overwritten with
# the raw frequencies, otherwise the PCA step has no effect.
Toronto_grouped_clustering = pca.fit_transform(venue_frequencies)

Carry out the KMeans clustering

In [42]:
# how many clusters to look for
kclusters = 4

# build the estimator with a fixed seed, then fit it on the feature matrix
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(Toronto_grouped_clustering)

# first ten labels and the overall shape, as a quick sanity check
print(kmeans.labels_[:10])
print(kmeans.labels_.shape)

[3 3 0 0 0 0 0 0 0 0]
(17,)


Let us now create a dataframe that contains the neighborhood, its location and its cluster label, together with the top 10 venues

In [44]:
# Attach the cluster labels to a copy, so S_grouped keeps holding only the
# venue frequencies and earlier cells stay safe to re-run.
S_clustered = S_grouped.assign(**{"Cluster Labels": kmeans.labels_})

# Add latitude/longitude (and the other TData columns) to every neighbourhood.
# All keys in S_clustered come from TData, so a left merge yields the same rows
# as an outer merge while always keeping TData's row order.
Toronto_combined = TData.merge(S_clustered, left_on="Neighbourhood", right_on="Neighborhood", how="left")

# Add the ranked "Nth Most Common Venue" columns.
Toronto_combined = Toronto_combined.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Rows that were never clustered get the first label k-means cannot produce
# (labels run 0 .. n_clusters-1), so the placeholder never collides with a
# real cluster regardless of how many clusters were requested.
Toronto_combined["Cluster Labels"] = (
    Toronto_combined["Cluster Labels"].fillna(kmeans.n_clusters).astype("int")
)

Toronto_combined.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Neighborhood,Noodle House,Optical Shop,Other Great Outdoors,Paper / Office Supplies Store,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,"Rouge,Malvern",0.0,0.0,0.021739,0.021739,...,Zoo Exhibit,Fast Food Restaurant,Pizza Place,Zoo,Bus Station,Caribbean Restaurant,Fried Chicken Joint,Theme Park Ride / Attraction,Grocery Store,Gym
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,"Highland Creek,Rouge Hill,Port Union",0.0,0.0,0.0,0.0,...,Breakfast Spot,Coffee Shop,Sandwich Place,Pizza Place,Fried Chicken Joint,Fish & Chips Shop,Hardware Store,Gym,Grocery Store,Supermarket
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,"Guildwood,Morningside,West Hill",0.0,0.0,0.0,0.0,...,Pizza Place,Coffee Shop,Breakfast Spot,Greek Restaurant,Park,Fast Food Restaurant,Gym / Fitness Center,Grocery Store,Sports Bar,Supermarket
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Woburn,0.0,0.0,0.0,0.016393,...,Coffee Shop,Fast Food Restaurant,Sandwich Place,Pharmacy,Indian Restaurant,Discount Store,Pizza Place,Chinese Restaurant,Furniture / Home Store,Supermarket
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Cedarbrae,0.0,0.0,0.0,0.01,...,Coffee Shop,Clothing Store,Gym,Fast Food Restaurant,Sandwich Place,Restaurant,Indian Restaurant,Wings Joint,Pizza Place,Pharmacy


In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One colour per fitted cluster plus a final colour reserved for rows that
# carry no real cluster label. The cluster count is read from the fitted model
# and not modified here, so the cell can be re-run safely.
n_clusters = kmeans.n_clusters
palette = [colors.rgb2hex(rgba)
           for rgba in cm.rainbow(np.linspace(0, 1, n_clusters + 1))]
unclustered_color = palette[-1]

# add markers to the map
for lat, lon, poi, cluster in zip(Toronto_combined['Latitude'],
                                  Toronto_combined['Longitude'],
                                  # 'Neighbourhood' (from TData) is filled for every row,
                                  # unlike the merged-in 'Neighborhood' key.
                                  Toronto_combined['Neighbourhood'],
                                  Toronto_combined['Cluster Labels']):
    # Index the palette by the label itself: with `cluster - 1`, label 0 wraps
    # around to the last colour and becomes indistinguishable from the
    # placeholder label.
    if 0 <= cluster < n_clusters:
        marker_color = palette[cluster]
    else:
        marker_color = unclustered_color

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Analyze Clusters

### Cluster 1

In [46]:
# Rows labelled 0 by k-means (shown as "Cluster 1"), ranked-venue columns only.
in_cluster = Toronto_combined['Cluster Labels'] == 0
ranked_venue_columns = slice("1st Most Common Venue", "10th Most Common Venue")
Toronto_combined.loc[in_cluster, ranked_venue_columns].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Breakfast Spot,Coffee Shop,Sandwich Place,Pizza Place,Fried Chicken Joint,Fish & Chips Shop,Hardware Store,Gym,Grocery Store,Supermarket
2,Pizza Place,Coffee Shop,Breakfast Spot,Greek Restaurant,Park,Fast Food Restaurant,Gym / Fitness Center,Grocery Store,Sports Bar,Supermarket
3,Coffee Shop,Fast Food Restaurant,Sandwich Place,Pharmacy,Indian Restaurant,Discount Store,Pizza Place,Chinese Restaurant,Furniture / Home Store,Supermarket
4,Coffee Shop,Clothing Store,Gym,Fast Food Restaurant,Sandwich Place,Restaurant,Indian Restaurant,Wings Joint,Pizza Place,Pharmacy
5,Fast Food Restaurant,Coffee Shop,Pizza Place,Sandwich Place,Pharmacy,Grocery Store,Liquor Store,Big Box Store,Theater,Discount Store


Cluster 1 seems to be dominated by Restaurants and Coffee shops.

### Cluster 2

In [47]:
# Rows labelled 1 by k-means (shown as "Cluster 2"), ranked-venue columns only.
in_cluster = Toronto_combined['Cluster Labels'] == 1
ranked_venue_columns = slice("1st Most Common Venue", "10th Most Common Venue")
Toronto_combined.loc[in_cluster, ranked_venue_columns].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Zoo Exhibit,Fast Food Restaurant,Pizza Place,Zoo,Bus Station,Caribbean Restaurant,Fried Chicken Joint,Theme Park Ride / Attraction,Grocery Store,Gym


Cluster 2 is all about the Zoo.

### Cluster 3

In [48]:
# Rows labelled 2 by k-means (shown as "Cluster 3"), ranked-venue columns only.
in_cluster = Toronto_combined['Cluster Labels'] == 2
ranked_venue_columns = slice("1st Most Common Venue", "10th Most Common Venue")
Toronto_combined.loc[in_cluster, ranked_venue_columns].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,Golf Course,Trail,Grocery Store,Sculpture Garden,Playground,Farm,Warehouse Store,Zoo Exhibit,Zoo,Yoga Studio


Cluster 3 is recreational with the Golf course and the Trail.

### Cluster 4

In [49]:
# Rows labelled 3 by k-means (shown as "Cluster 4"), ranked-venue columns only.
in_cluster = Toronto_combined['Cluster Labels'] == 3
ranked_venue_columns = slice("1st Most Common Venue", "10th Most Common Venue")
Toronto_combined.loc[in_cluster, ranked_venue_columns].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Chinese Restaurant,Coffee Shop,Pharmacy,Bakery,Shopping Mall,Cantonese Restaurant,Sandwich Place,Restaurant,Indian Restaurant,Breakfast Spot
14,Chinese Restaurant,Coffee Shop,Pizza Place,Japanese Restaurant,Hong Kong Restaurant,Park,Pharmacy,Dessert Shop,Indian Restaurant,Vietnamese Restaurant
15,Chinese Restaurant,Coffee Shop,Bakery,Pharmacy,Fast Food Restaurant,Park,Pizza Place,Sandwich Place,Japanese Restaurant,Vietnamese Restaurant


Cluster 4 is predominantly Asian restaurants and coffee shops.