In [2]:
# Installing the scraping library
#!conda install -c anaconda beautifulsoup4 -y
# Installing the LXML, HTML5lib parsers and the request lib. 
#!conda install -c conda-forge lxml --y
#!conda install -c anaconda html5lib beautifulsoup4 --y
#!conda install -c anaconda requests --y

In [3]:
import os

import numpy as np
import pandas as pd
import requests

##### Read the HTML tables from the Wikipedia page

In [4]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# pd.read_html returns a list with one DataFrame per table parsed from the page
data = pd.read_html(url)
# show the number of tables found in the HTML document
len(data)

3

In [5]:
# Grabbing the first table
data[0]

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned


In [6]:
# Wrap the first parsed HTML table in its own dataframe (`df`) for cleaning
df = pd.DataFrame(data[0])

In [7]:
df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [8]:
# The table was parsed without a header, so its first row holds the column names.
# NOTE: not idempotent - running this cell twice would promote a data row to header.
df.columns = df.iloc[0] # Setting row [0] as the header
df = df[1:]             # Setting row[1] as the first data row

In [9]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


**Now, let's drop the rows whose borough is "Not assigned" and reset the index; neighbourhoods marked "Not assigned" are replaced with the name of their borough in the next step.**

In [10]:
# Drop the rows whose borough is 'Not assigned' and renumber the remaining
# rows from 0. reset_index(drop=True) discards the old index directly, so no
# temporary 'index' column has to be removed afterwards.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(7)   # Sanity checking

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned


##### Assigning the borough's name to the neighbourhood when the neighbourhood is "Not assigned"

In [11]:
# Rows whose neighbourhood is still 'Not assigned' inherit their borough name.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [12]:
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


##### Grouping the neighbourhoods that share a postcode

In [13]:
# Collapse the rows that share a postcode/borough into a single row whose
# neighbourhoods are joined into one comma-separated string.
tbl = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
      .agg(lambda names: ','.join(str(name) for name in names))
      .reset_index()
)

# Shuffle the rows and renumber them. A fixed random_state keeps the row order
# (and therefore every index-based step further down) identical on each run.
data = tbl.sample(frac=1, random_state=0).reset_index(drop=True)

In [14]:
data.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4A,North York,Victoria Village
1,M5L,Downtown Toronto,"Commerce Court,Victoria Hotel"
2,M2R,North York,Willowdale West
3,M3C,North York,"Flemingdon Park,Don Mills South"
4,M2M,North York,"Newtonbrook,Willowdale"
5,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf..."
6,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B..."
7,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
8,M4N,Central Toronto,Lawrence Park
9,M6J,West Toronto,"Little Portugal,Trinity"


##### Let's print the shape of the final dataframe

In [15]:
data.shape

(103, 3)

#### Collecting lat&long for each postal code


In [17]:
postl_codes = pd.read_csv('Geospatial_Coordinates.csv')
postl_codes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4A,North York,Victoria Village
1,M5L,Downtown Toronto,"Commerce Court,Victoria Hotel"
2,M2R,North York,Willowdale West
3,M3C,North York,"Flemingdon Park,Don Mills South"
4,M2M,North York,"Newtonbrook,Willowdale"


The **'postl_codes'** dataframe contains the latitude and longitude of each postal code.

Whereas the **'data'** dataframe contains the postal codes along with the corresponding borough and all the neighborhoods.

Let's now add the latitude and longitude data to the "data" dataframe.

In [19]:
# Look up each postcode's coordinates through an indexed table instead of two
# nested row-by-row scans over both frames.
coords = postl_codes.drop_duplicates(subset='Postal Code').set_index('Postal Code')

Latitude = data['Postcode'].map(coords['Latitude'])
Longitude = data['Postcode'].map(coords['Longitude'])

# A postcode that has no coordinates would otherwise end up as a silent NaN;
# report it explicitly, naming the postcodes that could not be resolved.
missing = data.loc[Latitude.isna() | Longitude.isna(), 'Postcode'].tolist()
if missing:
    raise ValueError('No coordinates found for postcodes: {}'.format(missing))

data['Latitude'] = Latitude
data['Longitude'] = Longitude

In [20]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4A,North York,Victoria Village,43.725882,-79.315572
1,M5L,Downtown Toronto,"Commerce Court,Victoria Hotel",43.648198,-79.379817
2,M2R,North York,Willowdale West,43.782736,-79.442259
3,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923
4,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493


In [21]:
# Let's Check if everything went well
data[data['Postcode']=='M5G']

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
95,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


Perfect!!!!!!!!!!

#### Let's explore the boroughs
But, let's first see which borough has the highest number of counts

In [29]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library
import json # library to handle JSON files
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [25]:
data['Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

We can see that North York has highest number of records so let's explore it by subsetting the dataframe

In [27]:
# Keep only the North York rows and renumber them from 0.
in_north_york = data['Borough'] == 'North York'
North_York_data = data.loc[in_north_york].reset_index(drop=True)
North_York_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4A,North York,Victoria Village,43.725882,-79.315572
1,M2R,North York,Willowdale West,43.782736,-79.442259
2,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923
3,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493
4,M3B,North York,Don Mills North,43.745906,-79.352188


Let's get the geographical coordinates of North york

In [30]:
adress = 'North York'

geolocator = Nominatim(user_agent='ny_explorer')
location = geolocator.geocode(adress)
# geocode() gives back None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(adress))
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of North york are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North york are 43.7708175, -79.4132998.


Now we can visualize the neighborhoods in the North York Borough

In [32]:
# create a map centred on the geocoded North York coordinates
North_York = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of North_York_data; the popup shows the
# row's neighbourhood string
for lat, lng, label in zip(North_York_data['Latitude'], North_York_data['Longitude'], North_York_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html is also passed to Popup above; whether
        # CircleMarker uses it is not visible here - confirm it is needed
        parse_html=False).add_to(North_York)  
    
North_York

#### Now, using the Foursquare API, let's explore the venues of each North York neighborhood

#### Define Foursquare Credentials and Version

In [33]:
# Read the Foursquare credentials from the environment so that the secret is
# never stored in (or printed by) the notebook.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Fail early with an actionable message rather than at the first API call.
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.')

print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


We start by creating a function that, for each North York neighbourhood, grabs the location (latitude, longitude), the name and the category of every nearby venue.

In [37]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each neighborhood; they are consumed in
        parallel with zip().
    radius : int, default 500
        Value forwarded to the `radius` parameter of the explore request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue (or no neighborhood) was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30)
        # surface HTTP errors (bad credentials, quota, ...) right here instead
        # of as a KeyError while digging through the JSON below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets no category label
                categories[0]['name'] if categories else None))

    # build the frame once, after the loop; passing the column names here keeps
    # the layout stable even when venue_rows is empty
    return pd.DataFrame(venue_rows, columns=columns)

In [38]:
North_York_venues = getNearbyVenues(names=North_York_data['Neighbourhood'],
                                   latitudes=North_York_data['Latitude'],
                                   longitudes=North_York_data['Longitude'])

Victoria Village
Willowdale West
Flemingdon Park,Don Mills South
Newtonbrook,Willowdale
Don Mills North
Downsview,North Park,Upwood Park
Silver Hills,York Mills
Downsview Northwest
York Mills West
Emery,Humberlea
Parkwoods
Fairview,Henry Farm,Oriole
Downsview Central
Bedford Park,Lawrence Manor East
Hillcrest Village
Downsview West
Bayview Village
Bathurst Manor,Downsview North,Wilson Heights
CFB Toronto,Downsview East
Northwood Park,York University
Humber Summit
Willowdale South
Lawrence Heights,Lawrence Manor
Glencairn


Let's check the size of the resulting dataframe

In [39]:
print(North_York_venues.shape)
North_York_venues.head()

(245, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
1,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
2,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
3,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
4,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection


Let's check how many venues were returned for each neighborhood

In [40]:
North_York_venues.groupby('Neighborhood').count().head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor,Downsview North,Wilson Heights",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
"Bedford Park,Lawrence Manor East",24,24,24,24,24,24
"CFB Toronto,Downsview East",3,3,3,3,3,3
Don Mills North,6,6,6,6,6,6


Let's find out how many unique categories can be curated from all the returned venues

In [43]:
print('There are {} uniques categories.'.format(len(North_York_venues['Venue Category'].unique())))

There are 111 uniques categories.


#### Now it's time to closely Analyze Each Neighborhood

In [50]:
# One hot encoding
North_York_onehot = pd.get_dummies(North_York_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category literally named 'Neighborhood' would collide with the label
# column added below and be silently overwritten, so move it out of the way.
North_York_onehot = North_York_onehot.rename(
    columns={'Neighborhood': 'Neighborhood (venue category)'})

# add the neighborhood label back as the first column
North_York_onehot.insert(0, 'Neighborhood', North_York_venues['Neighborhood'])

North_York_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bank,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's examine the new dataframe size

In [51]:
North_York_onehot.shape

(245, 112)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [54]:
# Average the one-hot columns per neighborhood, then turn the group key back
# into an ordinary 'Neighborhood' column.
category_means = North_York_onehot.groupby('Neighborhood').mean()
North_York_grouped = category_means.reset_index()
North_York_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bank,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,...,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park,Lawrence Manor East",0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041667,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto,Downsview East",0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's confirm the new size

In [56]:
North_York_grouped.shape

(23, 112)

#### Let's print each neighborhood along with the top 5 most common venues

In [57]:
# Number of venue categories printed per neighborhood
num_top_venues = 5

# Print, for every neighborhood, its most frequent venue categories.
for hood in North_York_grouped['Neighborhood']:
    print("----"+hood+"----")
    # transpose the neighborhood's row(s) so that each column becomes a row
    temp = North_York_grouped[North_York_grouped['Neighborhood']==hood].T.reset_index()
    temp.columns = ['venue','freq']
    # skip the first row, which holds the 'Neighborhood' label itself
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    # frequencies are rounded to two decimals before they are sorted
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
    

----Bathurst Manor,Downsview North,Wilson Heights----
                       venue  freq
0                Coffee Shop  0.11
1                Supermarket  0.05
2                  Pet Store  0.05
3                   Pharmacy  0.05
4  Middle Eastern Restaurant  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park,Lawrence Manor East----
                     venue  freq
0              Coffee Shop  0.08
1       Italian Restaurant  0.08
2  Comfort Food Restaurant  0.04
3           Breakfast Spot  0.04
4             Cupcake Shop  0.04


----CFB Toronto,Downsview East----
          venue  freq
0       Airport  0.33
1          Park  0.33
2      Bus Stop  0.33
3      Pharmacy  0.00
4  Optical Shop  0.00


----Don Mills North----
                  venue  freq
0                  Café  0.17
1        Baseball Field  0.17
2  Gym / F

Let's put that into a pandas dataframe

In [58]:
# Helper that ranks the venue categories of one neighborhood row.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped (in this notebook it is the
    'Neighborhood' label). The remaining entries are ordered from the highest
    to the lowest value and the leading index labels are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [121]:
num_top_venues = 10

def _ordinal(position):
    """Return `position` with its English ordinal suffix (1st, 2nd, 3rd, 4th, ...)."""
    # 10th-20th always take 'th' (covers 11th, 12th, 13th)
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)

# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = North_York_grouped['Neighborhood']

# fill the ranked venue categories row by row
for ind in range(North_York_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(North_York_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Restaurant,Supermarket,Pizza Place,Deli / Bodega,Pharmacy,Pet Store,Sandwich Place,Shopping Mall,Diner
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
2,"Bedford Park,Lawrence Manor East",Coffee Shop,Italian Restaurant,Café,Juice Bar,Pub,Pizza Place,Pharmacy,Butcher,Liquor Store,Comfort Food Restaurant
3,"CFB Toronto,Downsview East",Airport,Bus Stop,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store
4,Don Mills North,Japanese Restaurant,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Basketball Court,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store


## 4. Cluster Neighborhoods
Run k-means to cluster the neighborhood into 5 clusters.

In [122]:
# set number of clusters
kclusters = 5

# Cluster on the category frequencies only, so leave out the label column.
# `columns=` is used because newer pandas releases no longer accept the axis
# as a positional argument of drop().
North_York_grouped_clustering = North_York_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(North_York_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 4, 0, 3, 0, 0, 0, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [123]:
# add clustering labels
# Remove a 'Cluster Labels' column left by a previous run first: insert()
# raises ValueError when the column already exists, which made this cell
# fail on re-execution.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [124]:
# Let's see if the column conatining the label for each row was sussessfully addes
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Restaurant,Supermarket,Pizza Place,Deli / Bodega,Pharmacy,Pet Store,Sandwich Place,Shopping Mall,Diner
1,0,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
2,0,"Bedford Park,Lawrence Manor East",Coffee Shop,Italian Restaurant,Café,Juice Bar,Pub,Pizza Place,Pharmacy,Butcher,Liquor Store,Comfort Food Restaurant
3,4,"CFB Toronto,Downsview East",Airport,Bus Stop,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store
4,0,Don Mills North,Japanese Restaurant,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Basketball Court,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store


In [125]:
# Work on a renamed copy: rename() returns a new frame, so North_York_data
# keeps its 'Neighbourhood' column and the earlier cells that read it stay
# re-runnable. The new spelling matches the 'Neighborhood' column of
# neighborhoods_venues_sorted.
North_York_merged = North_York_data.rename(columns={'Neighbourhood': 'Neighborhood'})
North_York_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4A,North York,Victoria Village,43.725882,-79.315572
1,M2R,North York,Willowdale West,43.782736,-79.442259
2,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923
3,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493
4,M3B,North York,Don Mills North,43.745906,-79.352188


In [126]:
# Attach the cluster label and the top-venue columns to each postcode row,
# matching the rows on the neighborhood name.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
North_York_merged = North_York_merged.join(venues_by_neighborhood, on='Neighborhood')

In [127]:
# # check the last columns!
North_York_merged.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Intersection,French Restaurant,Coffee Shop,Portuguese Restaurant,Hockey Arena,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store
1,M2R,North York,Willowdale West,43.782736,-79.442259,0.0,Coffee Shop,Pharmacy,Grocery Store,Pizza Place,Butcher,Discount Store,Women's Store,Dessert Shop,Comfort Food Restaurant,Concert Hall
2,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923,0.0,Asian Restaurant,Gym,Coffee Shop,Beer Store,Bike Shop,Discount Store,Chinese Restaurant,Concert Hall,Japanese Restaurant,Italian Restaurant
3,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493,1.0,Piano Bar,Women's Store,Chocolate Shop,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop
4,M3B,North York,Don Mills North,43.745906,-79.352188,0.0,Japanese Restaurant,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Basketball Court,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store
5,M6L,North York,"Downsview,North Park,Upwood Park",43.713756,-79.490074,0.0,Construction & Landscaping,Bakery,Park,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop
6,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714,,,,,,,,,,,
7,M3N,North York,Downsview Northwest,43.761631,-79.520999,0.0,Grocery Store,Athletics & Sports,Gym / Fitness Center,Liquor Store,Discount Store,Women's Store,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
8,M2P,North York,York Mills West,43.752758,-79.400049,0.0,Convenience Store,Bank,Electronics Store,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
9,M9M,North York,"Emery,Humberlea",43.724766,-79.532242,2.0,Baseball Field,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop


From the above table we can see that the row at index 6 has no cluster label and no venue data, so let's drop it.

In [129]:
# Drop every neighborhood that has no cluster label after the join, instead of
# hard-coding a row index that depends on the shuffled row order. The original
# index labels are kept; .copy() makes the result an independent frame for the
# column assignment performed later.
North_York_merged = North_York_merged.dropna(subset=['Cluster Labels']).copy()
North_York_merged.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Intersection,French Restaurant,Coffee Shop,Portuguese Restaurant,Hockey Arena,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store
1,M2R,North York,Willowdale West,43.782736,-79.442259,0.0,Coffee Shop,Pharmacy,Grocery Store,Pizza Place,Butcher,Discount Store,Women's Store,Dessert Shop,Comfort Food Restaurant,Concert Hall
2,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923,0.0,Asian Restaurant,Gym,Coffee Shop,Beer Store,Bike Shop,Discount Store,Chinese Restaurant,Concert Hall,Japanese Restaurant,Italian Restaurant
3,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493,1.0,Piano Bar,Women's Store,Chocolate Shop,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop
4,M3B,North York,Don Mills North,43.745906,-79.352188,0.0,Japanese Restaurant,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Basketball Court,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store
5,M6L,North York,"Downsview,North Park,Upwood Park",43.713756,-79.490074,0.0,Construction & Landscaping,Bakery,Park,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop
7,M3N,North York,Downsview Northwest,43.761631,-79.520999,0.0,Grocery Store,Athletics & Sports,Gym / Fitness Center,Liquor Store,Discount Store,Women's Store,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
8,M2P,North York,York Mills West,43.752758,-79.400049,0.0,Convenience Store,Bank,Electronics Store,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
9,M9M,North York,"Emery,Humberlea",43.724766,-79.532242,2.0,Baseball Field,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop
10,M3A,North York,Parkwoods,43.753259,-79.329656,4.0,Food & Drink Shop,BBQ Joint,Bus Stop,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping


Finally, let's visualize the resulting clusters

In [130]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
North_York_merged['Cluster Labels'] = North_York_merged['Cluster Labels'].astype(int)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(North_York_merged['Latitude'], North_York_merged['Longitude'], North_York_merged['Neighborhood'], North_York_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 is kept from the original: label 0 wraps around to the last
    # color, so every cluster still gets its own distinct color
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Examine Clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster.

In [132]:
#### Cluster 1
# Rows with cluster label 0: the column at position 1 plus every column from position 5 onwards.
summary_columns = North_York_merged.columns[[1] + list(range(5, North_York_merged.shape[1]))]
North_York_merged.loc[North_York_merged['Cluster Labels'] == 0, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,0,Intersection,French Restaurant,Coffee Shop,Portuguese Restaurant,Hockey Arena,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store
1,North York,0,Coffee Shop,Pharmacy,Grocery Store,Pizza Place,Butcher,Discount Store,Women's Store,Dessert Shop,Comfort Food Restaurant,Concert Hall
2,North York,0,Asian Restaurant,Gym,Coffee Shop,Beer Store,Bike Shop,Discount Store,Chinese Restaurant,Concert Hall,Japanese Restaurant,Italian Restaurant
4,North York,0,Japanese Restaurant,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Basketball Court,Women's Store,Concert Hall,Construction & Landscaping,Convenience Store
5,North York,0,Construction & Landscaping,Bakery,Park,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop
7,North York,0,Grocery Store,Athletics & Sports,Gym / Fitness Center,Liquor Store,Discount Store,Women's Store,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
8,North York,0,Convenience Store,Bank,Electronics Store,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
11,North York,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Food Court,Bakery,Asian Restaurant,Women's Store,Toy / Game Store,Tea Room
13,North York,0,Coffee Shop,Italian Restaurant,Café,Juice Bar,Pub,Pizza Place,Pharmacy,Butcher,Liquor Store,Comfort Food Restaurant
14,North York,0,Golf Course,Pool,Mediterranean Restaurant,Dog Run,Women's Store,Dessert Shop,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping


In [133]:
#### Cluster 2
# Rows with cluster label 1: the column at position 1 plus every column from position 5 onwards.
summary_columns = North_York_merged.columns[[1] + list(range(5, North_York_merged.shape[1]))]
North_York_merged.loc[North_York_merged['Cluster Labels'] == 1, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,North York,1,Piano Bar,Women's Store,Chocolate Shop,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop


In [134]:
#### Cluster 3
# Rows with cluster label 2: the column at position 1 plus every column from position 5 onwards.
summary_columns = North_York_merged.columns[[1] + list(range(5, North_York_merged.shape[1]))]
North_York_merged.loc[North_York_merged['Cluster Labels'] == 2, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,North York,2,Baseball Field,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop


In [135]:
#### Cluster 4
# Rows with cluster label 3: the column at position 1 plus every column from position 5 onwards.
summary_columns = North_York_merged.columns[[1] + list(range(5, North_York_merged.shape[1]))]
North_York_merged.loc[North_York_merged['Cluster Labels'] == 3, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,North York,3,Food Truck,Baseball Field,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop


In [136]:
#### Cluster 5
# Rows with cluster label 4: the column at position 1 plus every column from position 5 onwards.
summary_columns = North_York_merged.columns[[1] + list(range(5, North_York_merged.shape[1]))]
North_York_merged.loc[North_York_merged['Cluster Labels'] == 4, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,North York,4,Food & Drink Shop,BBQ Joint,Bus Stop,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
18,North York,4,Airport,Bus Stop,Park,Women's Store,Dim Sum Restaurant,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store
