# Coursera Capstone Project

### This is a notebook created for the Coursera Capstone Project from the IBM Applied Data Science Capstone

# Part 1

In [1]:
# let's import a library to make the data analysis
import pandas as pd

In [2]:
path = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# let's import the data using Pandas library to convert the data in a DataFrame
df = pd.read_html(path, header=0, na_values=['Not assigned'])[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# let's ignore the cells with a borough that is Not assigned
df.dropna(subset=['Borough'], inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [5]:
# let's check if there are cells with a Neighbourhood that is Not assigned
df[df['Neighbourhood'].isna()]

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,


In [6]:
# let's assigned the Borough to the Neighbourhood that is Not assigned
df['Neighbourhood'].fillna(df['Borough'], inplace=True)
df[df['Neighbourhood'].isna()]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [7]:
# grouping by Neighbourhoods that have the same postcode
df_postcodes = df.groupby(['Postcode','Borough']).Neighbourhood.agg([('Neighbourhood', ', '.join)])
df_postcodes.reset_index(inplace=True)
df_postcodes.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
print('The shape of the dataset is:',df_postcodes.shape) # the shape of the dataframe

The shape of the dataset is: (103, 3)


# Part 2

In [9]:
df_coord = pd.read_csv('http://cocl.us/Geospatial_data')
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
df_coord.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df.rename(columns={'Postcode': 'PostalCode'}, inplace=True)
df_coord.head()
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [11]:
df = pd.merge(df, df_coord, on='PostalCode')
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


# Part 3

In [12]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors # Matplotlib and associated plotting modules
from sklearn.cluster import KMeans # import k-means from clustering stage
import folium # map rendering library

#### Use geopy library to get the latitude and longitude values of Toronto City.

In [13]:
address = 'Toronto, CN'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 45.4414408, -73.6500774.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [14]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [46]:
CLIENT_ID = 'XXXX' # your Foursquare ID
CLIENT_SECRET = 'XXXX' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XXXX
CLIENT_SECRET:XXXX


#### Let's explore the first neighborhood in our dataframe.

In [16]:
df.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)
df.loc[0, 'Neighborhood']

'Parkwoods'

Get the neighborhood's latitude and longitude values.

In [17]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

In [18]:
# type your answer here
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?client_id=X4D2O1VY0QPNU4RELECJCSB2UVAUYQNYOJCBTQBGDC0LYJIT&client_secret=DJYRZ1EQERBSC0NSAJSELZJKJPIMAKPMVAR0WK4VDADEKMIT&ll=45.4414408,-73.6500774&v=20180605&radius=500&limit=100'

In [19]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [20]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e3f2eff0cc1fd001c1cb5b2'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Lachine',
  'headerFullLocation': 'Lachine, Montreal',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 6,
  'suggestedBounds': {'ne': {'lat': 45.445940804500005,
    'lng': -73.6436758036377},
   'sw': {'lat': 45.4369407955, 'lng': -73.6564789963623}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '55f46d6c498e302d8870255e',
       'name': 'Falafel St-Jacques',
       'location': {'address': '345 Rue St-Jacques',
        'lat': 45.44336638823828,
        'lng': -73.64873195380359,
        'labeledLatLngs': [{'label': 'display',
          'lat': 45.44336

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [22]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Falafel St-Jacques,Falafel Restaurant,45.443366,-73.648732
1,Lafleur,Fast Food Restaurant,45.442367,-73.650918
2,Tim Hortons,Coffee Shop,45.442157,-73.646336
3,Le Cordon,Pizza Place,45.441905,-73.647277
4,La Belle Province,Fast Food Restaurant,45.443284,-73.648315


In [23]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


#### Let's create a function to repeat the same process to all the neighborhoods with only boroughs that contain the word Toronto

In [24]:
#Create new dataset
df_toronto = df[df['Borough'].str.contains('Toronto')]
df_toronto.reset_index(inplace=True)
df_toronto.drop('index', axis=1, inplace=True)
df_toronto.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [26]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Harbourfront
Queen's Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city

In [27]:
print(toronto_venues.shape)
toronto_venues.head()

(2025, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Harbourfront,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Queen's Park,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Queen's Park,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Queen's Park,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [28]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,5,5,5,5,5,5
Bathurst Quay,100,100,100,100,100,100
Brockton,21,21,21,21,21,21
Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100
CN Tower,3,3,3,3,3,3
...,...,...,...,...,...,...
Underground city,42,42,42,42,42,42
Union Station,1,1,1,1,1,1
University of Toronto,21,21,21,21,21,21
Victoria Hotel,9,9,9,9,9,9


#### Let's find out how many unique categories can be curated from all the returned venues

In [29]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 202 uniques categories.


In [30]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
toronto_onehot.shape

(2025, 202)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [32]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Adelaide,0.00000,0.0,0.0,0.00000,0.0,0.00,0.00,0.0,0.000000,...,0.00000,0.00,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
1,Bathurst Quay,0.00000,0.0,0.0,0.00000,0.0,0.05,0.01,0.0,0.000000,...,0.00000,0.01,0.01,0.0,0.000000,0.0,0.0,0.01,0.0,0.0
2,Brockton,0.00000,0.0,0.0,0.00000,0.0,0.00,0.00,0.0,0.095238,...,0.00000,0.00,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.00000,0.0,0.0,0.03000,0.0,0.00,0.01,0.0,0.010000,...,0.00000,0.01,0.01,0.0,0.000000,0.0,0.0,0.01,0.0,0.0
4,CN Tower,0.00000,0.0,0.0,0.00000,0.0,0.00,0.00,0.0,0.000000,...,0.00000,0.00,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Underground city,0.02381,0.0,0.0,0.02381,0.0,0.00,0.00,0.0,0.000000,...,0.02381,0.00,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
68,Union Station,0.00000,0.0,0.0,0.00000,0.0,0.00,0.00,0.0,0.000000,...,0.00000,0.00,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
69,University of Toronto,0.00000,0.0,0.0,0.00000,0.0,0.00,0.00,0.0,0.000000,...,0.00000,0.00,0.00,0.0,0.047619,0.0,0.0,0.00,0.0,0.0
70,Victoria Hotel,0.00000,0.0,0.0,0.00000,0.0,0.00,0.00,0.0,0.000000,...,0.00000,0.00,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0


In [33]:
toronto_grouped.shape # let's confirm the new shape

(72, 202)

#### Let's print each neighborhood along with the top 5 most common venues

In [34]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
                  venue  freq
0  Gym / Fitness Center   0.2
1                  Café   0.2
2      Basketball Court   0.2
3  Caribbean Restaurant   0.2
4   Japanese Restaurant   0.2


----Bathurst Quay----
                venue  freq
0         Coffee Shop  0.12
1            Aquarium  0.05
2                Café  0.04
3               Hotel  0.04
4  Italian Restaurant  0.04


----Brockton----
              venue  freq
0               Gym  0.10
1       Coffee Shop  0.10
2        Beer Store  0.10
3  Asian Restaurant  0.10
4    Sandwich Place  0.05


----Business Reply Mail Processing Centre 969 Eastern----
         venue  freq
0  Coffee Shop  0.12
1         Café  0.08
2        Hotel  0.07
3   Steakhouse  0.04
4   Restaurant  0.04


----CN Tower----
               venue  freq
0        Coffee Shop  0.33
1               Park  0.33
2  Convenience Store  0.33
3        Yoga Studio  0.00
4             Museum  0.00


----Cabbagetown----
                 venue  freq
0                 

                   venue  freq
0                    Bar  0.12
1       Asian Restaurant  0.06
2             Restaurant  0.06
3            Pizza Place  0.04
4  Vietnamese Restaurant  0.04


----Rathnelly----
                  venue  freq
0        Clothing Store  0.13
1  Fast Food Restaurant  0.08
2           Coffee Shop  0.08
3            Shoe Store  0.05
4         Women's Store  0.03


----Richmond----
                  venue  freq
0  Fast Food Restaurant  0.15
1           Pizza Place  0.15
2    Athletics & Sports  0.08
3              Bus Line  0.08
4                  Café  0.08


----Riverdale----
                        venue  freq
0               Moving Target  0.33
1  Construction & Landscaping  0.33
2                         Bar  0.33
3                 Yoga Studio  0.00
4    Mediterranean Restaurant  0.00


----Roncesvalles----
                      venue  freq
0  Mediterranean Restaurant  0.25
1               Golf Course  0.25
2                      Pool  0.25
3                   

#### Let's put that into a *pandas* dataframe

In [35]:
# First, let's write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [36]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Gym / Fitness Center,Basketball Court,Japanese Restaurant,Caribbean Restaurant,Café,Women's Store,Deli / Bodega,Eastern European Restaurant,Donut Shop,Dog Run
1,Bathurst Quay,Coffee Shop,Aquarium,Italian Restaurant,Café,Hotel,Restaurant,Sporting Goods Shop,Scenic Lookout,Brewery,Fried Chicken Joint
2,Brockton,Gym,Coffee Shop,Asian Restaurant,Beer Store,Supermarket,Italian Restaurant,Japanese Restaurant,Fast Food Restaurant,Discount Store,Dim Sum Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Café,Hotel,Steakhouse,Restaurant,Seafood Restaurant,Gastropub,American Restaurant,Bar,Deli / Bodega
4,CN Tower,Coffee Shop,Park,Convenience Store,Women's Store,Dance Studio,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner


## Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [37]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [38]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,,,,,,,,,,,
1,M4A,North York,Victoria Village,43.725882,-79.315572,,,,,,,,,,,
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0.0,Park,Food & Drink Shop,Women's Store,Dance Studio,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,,,,,,,,,,,
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,,,,,,,,,,,


Finally, let's visualize the resulting clusters

In [39]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        #color=rainbow[cluster-1],
        fill=True,
        #fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

#### Cluster 1

In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,0.0,Park,Food & Drink Shop,Women's Store,Dance Studio,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant
5,Downtown Toronto,0.0,Coffee Shop,Pizza Place,Portuguese Restaurant,Hockey Arena,Intersection,Curling Ice,Donut Shop,Dog Run,Discount Store,Diner
6,Queen's Park,0.0,Coffee Shop,Pizza Place,Portuguese Restaurant,Hockey Arena,Intersection,Curling Ice,Donut Shop,Dog Run,Discount Store,Diner
12,Downtown Toronto,0.0,Coffee Shop,Park,Pub,Café,Bakery,Mexican Restaurant,Breakfast Spot,Restaurant,Chocolate Shop,Performing Arts Venue
13,Downtown Toronto,0.0,Furniture / Home Store,Clothing Store,Coffee Shop,Miscellaneous Shop,Boutique,Event Space,Vietnamese Restaurant,Accessories Store,Dance Studio,Donut Shop
...,...,...,...,...,...,...,...,...,...,...,...,...
189,Downtown Toronto,0.0,Furniture / Home Store,Clothing Store,Coffee Shop,Construction & Landscaping,Miscellaneous Shop,Boutique,Snack Place,Event Space,Park,Accessories Store
190,Downtown Toronto,0.0,Snack Place,Airport,Construction & Landscaping,Park,Gluten-free Restaurant,Curling Ice,Donut Shop,Dog Run,Discount Store,Diner
191,Downtown Toronto,0.0,Greek Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Furniture / Home Store,Ice Cream Shop,Yoga Studio,Bubble Tea Shop,Brewery,Diner
195,Downtown Toronto,0.0,Greek Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Furniture / Home Store,Ice Cream Shop,Yoga Studio,Bubble Tea Shop,Brewery,Diner


#### Cluster 2

In [41]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
61,Downtown Toronto,1.0,Golf Course,Women's Store,Curling Ice,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
62,Downtown Toronto,1.0,Golf Course,Women's Store,Curling Ice,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
63,West Toronto,1.0,Golf Course,Women's Store,Curling Ice,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
64,West Toronto,1.0,Golf Course,Women's Store,Curling Ice,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
71,East Toronto,1.0,Golf Course,Women's Store,Curling Ice,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


#### Cluster 3

In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
129,Central Toronto,2.0,Coffee Shop,Korean Restaurant,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner
169,Downtown Toronto,2.0,Coffee Shop,Hobby Shop,Chinese Restaurant,Department Store,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run
182,Downtown Toronto,2.0,Coffee Shop,Hobby Shop,Chinese Restaurant,Department Store,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run
183,Downtown Toronto,2.0,Coffee Shop,Hobby Shop,Chinese Restaurant,Department Store,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run


#### Cluster 4

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
153,Downtown Toronto,3.0,Playground,Women's Store,Curling Ice,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


#### Cluster 5

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
72,East Toronto,4.0,Construction & Landscaping,Moving Target,Bar,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store
73,Downtown Toronto,4.0,Construction & Landscaping,Moving Target,Bar,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store
74,Downtown Toronto,4.0,Construction & Landscaping,Moving Target,Bar,Women's Store,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store


In [45]:
# Count the number of neighborhoods per cluster
toronto_merged.groupby('Cluster Labels').count()['Neighborhood']

Cluster Labels
0.0    62
1.0     5
2.0     4
3.0     1
4.0     3
Name: Neighborhood, dtype: int64