## Segmenting and Clustering Neighborhoods in Toronto
### Maria Hidalgo
#### Part III

##### Exploring and clustering the neighborhoods in Toronto.

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup


!pip install geopy
!pip install folium
print("Installed!")

print("Imported Data!")

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.6 MB/s eta 0:00:011
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Installed!
Imported Data!


In [2]:
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'   
extracting_data = requests.get(url).text                                  # Make a GET request to fetch the raw HTML content
Postal_Code_Data = BeautifulSoup(extracting_data, 'lxml')                 # Parse the html content

In [4]:
print(Postal_Code_Data.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"76b13534-5269-4f00-844f-009eecbb5619","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":979555370,"wgRevisionId":979555370,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communicati

In [5]:
table_pc = Postal_Code_Data.find('table')       #finding table scraping data 
category = table_pc.find_all('td')

postcode = []
borough = []
neighborhood = []

for i in range(0, len(category), 3):
    postcode.append(category[i].text.strip())
    borough.append(category[i+1].text.strip())
    neighborhood.append(category[i+2].text.strip())
        
df_pc = pd.DataFrame(data=[postcode, borough, neighborhood]).transpose()
df_pc.columns = ['Postcode', 'Borough', 'Neighborhood']
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
df_pc['Borough'].replace('Not assigned', np.nan, inplace=True) #replace Not assigned by NAN
df_pc.dropna(subset=['Borough'], inplace=True)                 #Drop NAN when true
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
df_pc_1 = df_pc.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()        #ordering ascendant
df_pc_1.columns = ['Postcode', 'Borough', 'Neighborhood']
df_pc_1

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [8]:
df_pc_1['Neighborhood'].replace('Not assigned', 'Borough', inplace=True)
df_pc_1

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [9]:
df_pc_1.shape #Shape of Data

(103, 3)

In [10]:
#In order to get the latitude and the longitude coordinates of each neighborhood:
df_geospatial = pd.read_csv('http://cocl.us/Geospatial_data')
df_geospatial.columns = ['Postcode', 'Latitude', 'Longitude']

In [11]:
df_coord = pd.merge(df_pc_1, df_geospatial, on=['Postcode'], how='inner')

df_coord_LL = df_coord[['Borough', 'Neighborhood', 'Postcode', 'Latitude', 'Longitude']].copy()

df_coord_LL.head()

Unnamed: 0,Borough,Neighborhood,Postcode,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [12]:
import json # library to handle JSON files
 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [13]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of the City of Toronto are 43.6534817, -79.3839347.


In [14]:
#Creating the map of Toronto with Foursquare
import folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, borough, neighborhood in zip(df_coord_LL['Latitude'], df_coord_LL['Longitude'], df_coord_LL['Borough'], df_coord_LL['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)
map_toronto

## 1. Exploring Neighborhoods in Toronto

In [16]:
#Now I am going to show the coordenates and the map for one of the neighborhood "YORK". It will be the neighborhood analized in this activity.

york_data = df_coord_LL[df_coord_LL['Borough'] == 'York'].reset_index(drop=True)
york_data.head()

Unnamed: 0,Borough,Neighborhood,Postcode,Latitude,Longitude
0,York,Humewood-Cedarvale,M6C,43.693781,-79.428191
1,York,Caledonia-Fairbanks,M6E,43.689026,-79.453512
2,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",M6M,43.691116,-79.476013
3,York,"Runnymede, The Junction North",M6N,43.673185,-79.487262
4,York,Weston,M9N,43.706876,-79.518188


In [17]:
address = 'York, Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of York are 43.6896191, -79.479188.


In [18]:
CLIENT_ID = '5QA0VL4IFTFWSTUZTOIPST5BVDN3TMU4R3LQCZB1LX4AAQXV' # your Foursquare ID
CLIENT_SECRET = 'BMYFMJZ5L0JA1ZYIE41G1V4CAAZ44ZHYRRVEUVHJ2FJMNRHB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5QA0VL4IFTFWSTUZTOIPST5BVDN3TMU4R3LQCZB1LX4AAQXV
CLIENT_SECRET:BMYFMJZ5L0JA1ZYIE41G1V4CAAZ44ZHYRRVEUVHJ2FJMNRHB


In [22]:
york_data.loc[0, 'Neighborhood'] #the first neighborhood in York is

'Humewood-Cedarvale'

In [23]:
neighborhood_latitude_1 = york_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude_1 = york_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = york_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude_1, 
                                                               neighborhood_longitude_1))

Latitude and longitude values of Humewood-Cedarvale are 43.6937813, -79.42819140000002.


In [24]:
search_query = 'Humewood-Cedarvale'
radius = 500
print(search_query + ' .... OK!')

Humewood-Cedarvale .... OK!


In [25]:
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_latitude_1, neighborhood_longitude_1, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=5QA0VL4IFTFWSTUZTOIPST5BVDN3TMU4R3LQCZB1LX4AAQXV&client_secret=BMYFMJZ5L0JA1ZYIE41G1V4CAAZ44ZHYRRVEUVHJ2FJMNRHB&v=20180605&ll=43.6937813,-79.42819140000002&radius=500&limit=100'

In [26]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fb06ddb4467db32a39984e9'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Cedarvale',
  'headerFullLocation': 'Cedarvale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.6982813045, 'lng': -79.4219793104081},
   'sw': {'lat': 43.689281295499995, 'lng': -79.43440348959193}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b0afc19f964a520212b23e3',
       'name': 'Cedarvale Park',
       'location': {'address': '100 glen cedar',
        'crossStreet': 'Strathearn',
        'lat': 43.692534923091934,
        'lng': -79.42870527613704,
        'labeledLatLngs': [{'label': 'displ

In [27]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [28]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  app.launch_new_instance()


Unnamed: 0,name,categories,lat,lng
0,Cedarvale Park,Field,43.692535,-79.428705
1,Cedarvale Ravine,Trail,43.690188,-79.426106
2,Cedarvale Dog Park,Dog Run,43.692036,-79.429491
3,Cedarvale Tennis Courts,Tennis Court,43.692744,-79.432244
4,Phil White Arena,Hockey Arena,43.691303,-79.431761


In [29]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


## 2. Exploring Neighborhoods in York

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [31]:
#York_Venues
york_venues=getNearbyVenues(names=york_data['Neighborhood'],
                                   latitudes=york_data['Latitude'],
                                   longitudes=york_data['Longitude']
                                  )

Humewood-Cedarvale
Caledonia-Fairbanks
Del Ray, Mount Dennis, Keelsdale and Silverthorn
Runnymede, The Junction North
Weston


In [32]:
print(york_venues.shape)
york_venues.head()

(20, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Park,43.692535,-79.428705,Field
1,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Ravine,43.690188,-79.426106,Trail
2,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Dog Park,43.692036,-79.429491,Dog Run
3,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Tennis Courts,43.692744,-79.432244,Tennis Court
4,Humewood-Cedarvale,43.693781,-79.428191,Phil White Arena,43.691303,-79.431761,Hockey Arena


In [33]:
#How many venues for each Neighborhood
york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Caledonia-Fairbanks,4,4,4,4,4,4
"Del Ray, Mount Dennis, Keelsdale and Silverthorn",5,5,5,5,5,5
Humewood-Cedarvale,5,5,5,5,5,5
"Runnymede, The Junction North",4,4,4,4,4,4
Weston,2,2,2,2,2,2


In [34]:
#unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(york_venues['Venue Category'].unique())))

There are 17 uniques categories.


## 3. Analyzing Each Neighborhood

In [35]:
# one hot encoding
york_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[fixed_columns]

york_onehot.head()

Unnamed: 0,Neighborhood,Bar,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Dog Run,Field,Hockey Arena,Park,Pool,Restaurant,Sandwich Place,Tennis Court,Trail,Turkish Restaurant,Women's Store
0,Humewood-Cedarvale,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Humewood-Cedarvale,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,Humewood-Cedarvale,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [36]:
york_onehot.shape

(20, 18)

### grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [37]:
york_grouped = york_onehot.groupby('Neighborhood').mean().reset_index()
york_grouped

Unnamed: 0,Neighborhood,Bar,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Dog Run,Field,Hockey Arena,Park,Pool,Restaurant,Sandwich Place,Tennis Court,Trail,Turkish Restaurant,Women's Store
0,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.25
1,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.2,0.0
2,Humewood-Cedarvale,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0
3,"Runnymede, The Junction North",0.0,0.25,0.25,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Weston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
york_grouped.shape

(5, 18)

### Each neighborhood along with the top 5 most common venues

In [39]:
num_top_venues = 5

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Caledonia-Fairbanks----
                venue  freq
0                Park  0.50
1       Women's Store  0.25
2                Pool  0.25
3  Turkish Restaurant  0.00
4               Trail  0.00


----Del Ray, Mount Dennis, Keelsdale and Silverthorn----
                venue  freq
0                 Bar   0.2
1  Turkish Restaurant   0.2
2      Discount Store   0.2
3      Sandwich Place   0.2
4          Restaurant   0.2


----Humewood-Cedarvale----
          venue  freq
0  Hockey Arena   0.2
1         Trail   0.2
2  Tennis Court   0.2
3       Dog Run   0.2
4         Field   0.2


----Runnymede, The Junction North----
               venue  freq
0            Brewery  0.25
1           Bus Line  0.25
2  Convenience Store  0.25
3     Breakfast Spot  0.25
4                Bar  0.00


----Weston----
                venue  freq
0                Park   1.0
1                 Bar   0.0
2  Turkish Restaurant   0.0
3               Trail   0.0
4        Tennis Court   0.0




In [40]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Displaying the top 10 venues for each neighborhood.

In [41]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

for ind in np.arange(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Caledonia-Fairbanks,Park,Women's Store,Pool,Dog Run,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Hockey Arena
1,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Bar,Sandwich Place,Restaurant,Turkish Restaurant,Discount Store,Dog Run,Breakfast Spot,Brewery,Bus Line,Convenience Store
2,Humewood-Cedarvale,Hockey Arena,Dog Run,Trail,Tennis Court,Field,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store
3,"Runnymede, The Junction North",Breakfast Spot,Brewery,Bus Line,Convenience Store,Women's Store,Field,Discount Store,Dog Run,Hockey Arena,Turkish Restaurant
4,Weston,Park,Women's Store,Field,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Dog Run,Hockey Arena


### 4. Clustering Neighborhood

In [42]:
kclusters = 5

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 4, 1, 2, 0], dtype=int32)

### A new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [43]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = york_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,York,Humewood-Cedarvale,M6C,43.693781,-79.428191,1,Hockey Arena,Dog Run,Trail,Tennis Court,Field,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store
1,York,Caledonia-Fairbanks,M6E,43.689026,-79.453512,3,Park,Women's Store,Pool,Dog Run,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Hockey Arena
2,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",M6M,43.691116,-79.476013,4,Bar,Sandwich Place,Restaurant,Turkish Restaurant,Discount Store,Dog Run,Breakfast Spot,Brewery,Bus Line,Convenience Store
3,York,"Runnymede, The Junction North",M6N,43.673185,-79.487262,2,Breakfast Spot,Brewery,Bus Line,Convenience Store,Women's Store,Field,Discount Store,Dog Run,Hockey Arena,Turkish Restaurant
4,York,Weston,M9N,43.706876,-79.518188,0,Park,Women's Store,Field,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Dog Run,Hockey Arena


## Visualizing the resulting clusters about neighborhood in York

In [44]:
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

In [45]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Analizing each cluster

# I examined each cluster and determined the discriminating venue categories that distinguish each cluster. Based on the defining categories, I assigned a name to each cluster.

### Cluster 1: "Picnic area"

In [46]:
york_merged.loc[york_merged['Cluster Labels'] == 0, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Weston,0,Park,Women's Store,Field,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Dog Run,Hockey Arena


#### Cluster 1 is named "Picnic area" because it is a good option for a picnic day. You can find a park, a bus line and places to buy things to take away.

### Cluster 2: "Sport"

In [47]:
york_merged.loc[york_merged['Cluster Labels'] == 1, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Humewood-Cedarvale,1,Hockey Arena,Dog Run,Trail,Tennis Court,Field,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store


### Cluster 2 is named sport because it is perfect to watch a game, in person or in a big screen.

### Cluster 3: "Good option"


In [48]:
york_merged.loc[york_merged['Cluster Labels'] == 2, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,"Runnymede, The Junction North",2,Breakfast Spot,Brewery,Bus Line,Convenience Store,Women's Store,Field,Discount Store,Dog Run,Hockey Arena,Turkish Restaurant


### The cluster 3, named "Good option" shows a good match to go on weekends, because it has things for everyone.

### Cluster 4 : "Relaxing"

In [49]:
york_merged.loc[york_merged['Cluster Labels'] == 3, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Caledonia-Fairbanks,3,Park,Women's Store,Pool,Dog Run,Breakfast Spot,Brewery,Bus Line,Convenience Store,Discount Store,Hockey Arena


### The cluster 4, named "Relaxing" has places where you can have a good time buying things, watching a game, taking fresh air, etc.

### Cluster 5 : The Funniest Venues

In [50]:
york_merged.loc[york_merged['Cluster Labels'] == 4, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",4,Bar,Sandwich Place,Restaurant,Turkish Restaurant,Discount Store,Dog Run,Breakfast Spot,Brewery,Bus Line,Convenience Store


### The Cluster 5, named "The Funniest Venues" has a wide variety of places to eat and drink.