### Import useful libraries

In [99]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import requests
from collections import deque

### Import the Toronto data with the geographical information as build for the second part of the assignment
Recall that the geograpical coordinate of Toronto are 43.653963, -79.387207

In [100]:
df_toronto_geo = pd.read_csv('df_toronto_geo.csv').set_index("Postcode")
df_toronto_geo.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Configure Fursquare access

In [101]:
CLIENT_ID = 'LTNFDJLIM1CFBQMNBQYP2NTWQJVDJI0YCFUD2CW30JGTEUOI'
CLIENT_SECRET = 'CVHEUGRHTT0CTFPU55GFKM5SXKP20VMK2MARSPQLAIXZ2UCF'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LTNFDJLIM1CFBQMNBQYP2NTWQJVDJI0YCFUD2CW30JGTEUOI
CLIENT_SECRET:CVHEUGRHTT0CTFPU55GFKM5SXKP20VMK2MARSPQLAIXZ2UCF


### Get the the names and locations of the neighborhoods in Toronto and obtains the 50 top venues around.
To this aim, I define the function getVenues

In [102]:
def getVenues(names, latitudes, longitudes, r, lim):
    venues_lst=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, r, lim)
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_lst.append([(name, lat, lng, 
            v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    venues = pd.DataFrame([item for venues_lst in venues_lst for item in venues_lst])
    venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 
                  'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return venues

r = 1000
lim = 100
toronto_venues = getVenues(df_toronto_geo.Neighbourhood, df_toronto_geo.Latitude, df_toronto_geo.Longitude, r, lim)

### Check the amount of venues per neighboorhood and the total amount of unique categories in my data

In [103]:
toronto_venues.groupby("Neighborhood").Venue.count().sort_values(ascending=False).head()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 337 uniques categories.


### Create a dataframe with a one hot enconding of the venue category information

In [104]:
toronto_venues_ohe = pd.get_dummies(toronto_venues["Venue Category"],
                             prefix = "",
                             prefix_sep = "")

toronto_venues_ohe["Neighborhood"] = toronto_venues["Neighborhood"]


nindex = list(toronto_venues_ohe.columns).index("Neighborhood")
cols = deque(toronto_venues_ohe.columns)
cols.rotate(-nindex)
cols = list(cols)
toronto_venues_ohe = toronto_venues_ohe[cols]

toronto_venues_ohe.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Other Repair Shop,...,Monument / Landmark,Moroccan Restaurant,Motorcycle Shop,Movie Theater,Moving Target,Museum,Music School,Music Store,Music Venue,Nail Salon
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Compute the average number of venue categories per neighborhood

In [105]:
df_toronto_neigh = toronto_venues_ohe.groupby('Neighborhood').mean().reset_index()
df_toronto_neigh.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Other Repair Shop,...,Monument / Landmark,Moroccan Restaurant,Motorcycle Shop,Movie Theater,Moving Target,Museum,Music School,Music Store,Music Venue,Nail Salon
0,"Adelaide, King, Richmond",0.01,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Select only the N most frequent venue categories per neighborhood

In [106]:
def top_venues(row, n_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:n_top_venues]


n_top_venues = 5

columns = ['Neighborhood']
for i in np.arange(n_top_venues):
    columns.append('{} top venue'.format(i+1))

venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighborhood'] = df_toronto_neigh['Neighborhood']

for i in np.arange(df_toronto_neigh.shape[0]):
    venues_sorted.iloc[i, 1:] = top_venues(df_toronto_neigh.iloc[i, :], n_top_venues)
    
venues_sorted.head()

Unnamed: 0,Neighborhood,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Theater,Hotel,Concert Hall
1,Agincourt,Chinese Restaurant,Shopping Mall,Pizza Place,Caribbean Restaurant,Coffee Shop
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,Pharmacy,Bakery,Park,Pizza Place
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Pizza Place,Grocery Store,Bus Line,Park,Coffee Shop
4,"Alderwood, Long Branch",Discount Store,Pizza Place,Grocery Store,Pharmacy,Donut Shop


### Cluster the neighborhood

In [107]:
# number of clusters
k = 5

neighborhoods = df_toronto_neigh["Neighborhood"]
df_toronto_neigh = df_toronto_neigh.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(df_toronto_neigh)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:5])
print(kmeans.labels_.shape)

[3 0 0 0 0]
(102,)


### Create the dataframe containing neighborhood, location, cluster and the top five venues

In [108]:
df_toronto_neigh["Neighborhood"] = neighborhoods
df_toronto_neigh["Cluster Labels"] = kmeans.labels_

toronto_final = df_toronto_geo.merge(df_toronto_neigh, left_on = "Neighbourhood", right_on = "Neighborhood", how = "outer")
toronto_final = toronto_final.join(venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_final["Cluster Labels"] = toronto_final["Cluster Labels"].fillna(5).astype("int")

toronto_final.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,...,Music Store,Music Venue,Nail Salon,Neighborhood,Cluster Labels,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,"Rouge, Malvern",0,Fast Food Restaurant,Coffee Shop,Fruit & Vegetable Store,Park,Sandwich Place
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,"Highland Creek, Rouge Hill, Port Union",2,Park,Burger Joint,Breakfast Spot,Playground,Italian Restaurant
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,"Guildwood, Morningside, West Hill",0,Pizza Place,Fast Food Restaurant,Coffee Shop,Supermarket,Sports Bar
3,Scarborough,Woburn,43.770992,-79.216917,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,Woburn,0,Coffee Shop,Fast Food Restaurant,Pharmacy,Chinese Restaurant,Electronics Store
4,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,0.0,0.0,0.0,0.0,0.0,...,0.035714,0.0,0.0,Cedarbrae,0,Bakery,Coffee Shop,Indian Restaurant,Pharmacy,Burger Joint


### Cluster analysis

### Cluster 1: food and services

In [109]:
toronto_final.loc[toronto_final['Cluster Labels'] == 0, 
                     "1 top venue":"5 top venue"].head()

Unnamed: 0,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
0,Fast Food Restaurant,Coffee Shop,Fruit & Vegetable Store,Park,Sandwich Place
2,Pizza Place,Fast Food Restaurant,Coffee Shop,Supermarket,Sports Bar
3,Coffee Shop,Fast Food Restaurant,Pharmacy,Chinese Restaurant,Electronics Store
4,Bakery,Coffee Shop,Indian Restaurant,Pharmacy,Burger Joint
5,Fast Food Restaurant,Convenience Store,Restaurant,Sandwich Place,Pizza Place


### Cluster 2: open air

In [110]:
toronto_final.loc[toronto_final['Cluster Labels'] == 1, 
                     "1 top venue":"5 top venue"].head()

Unnamed: 0,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
102,Coffee Shop,Dog Run,Yoga Studio,Video Store,Vietnamese Restaurant


### Cluster 3: sport and beauty

In [111]:
toronto_final.loc[toronto_final['Cluster Labels'] == 2, 
                     "1 top venue":"5 top venue"].head()

Unnamed: 0,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
1,Park,Burger Joint,Breakfast Spot,Playground,Italian Restaurant
20,Park,Pool,Nail Salon,Video Game Store,Vietnamese Restaurant
31,Park,Grocery Store,Vietnamese Restaurant,Bank,Spa
91,Italian Restaurant,Park,Ice Cream Shop,Eastern European Restaurant,Shopping Mall
94,Park,Pizza Place,Bank,Mexican Restaurant,Pharmacy


### Cluster 4: coffee and ethnic food

In [112]:
toronto_final.loc[toronto_final['Cluster Labels'] == 3, 
                     "1 top venue":"5 top venue"].head()

Unnamed: 0,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
9,Restaurant,General Entertainment,Bank,Fast Food Restaurant,Gym Pool
11,Middle Eastern Restaurant,Pizza Place,Grocery Store,Supermarket,Flea Market
19,Bank,Japanese Restaurant,Skate Park,Fast Food Restaurant,Grocery Store
21,Café,Korean Restaurant,Pizza Place,Middle Eastern Restaurant,Coffee Shop
22,Coffee Shop,Japanese Restaurant,Ramen Restaurant,Bubble Tea Shop,Korean Restaurant


### Cluster 5: various

In [113]:
toronto_final.loc[toronto_final['Cluster Labels'] == 4, 
                     "1 top venue":"5 top venue"].head()

Unnamed: 0,1 top venue,2 top venue,3 top venue,4 top venue,5 top venue
32,Vietnamese Restaurant,Baseball Field,Restaurant,Zoo,Warehouse Store
