# Applied Data Science Capstone
## Neighborhood Data Clustering & Segmenting
### Part of IBM Data Science Professional Specialization on Coursera

## Part 1: Data Scraping & Loading & Cleaning

### Installing Libraries

In [2]:
# We will use Pandas library to extract data from a wikipedia page and convert it to a dataframe for our usage
# Importing necessary libraries

import pandas as pd
!pip install lxml
!pip install html5lib



### Data Extraction from Wikipedia

In [3]:
# Extracting data from wiki page.
# Source for code (Tutorial): 
# https://github.com/softhints/python/blob/master/notebooks/Scrape%20wiki%20tables%20with%20pandas%20and%20python.ipynb

from pandas.io.html import read_html
webpage = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

wikitables = read_html(webpage,  attrs={"class":"wikitable"})

print ("Extracted {num} wikitables".format(num=len(wikitables)))

Extracted 1 wikitables


In [4]:
wikitables[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Pandas Dataframe & Data Cleaning

In [5]:
# Converting extracted data to Pandas dataframe and reading first 5 rows
wiki_df = pd.DataFrame(wikitables[0])
wiki_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# Renaming Column Postcode to PostalCode & Neighbourhood to Neighborhood
wiki_df.columns.values[0] = 'PostalCode'
wiki_df.columns.values[2] = 'Neighborhood'
wiki_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
# For our assignment we will not be looking at "Not Assigned" neighborhood. Therefore we will drop them

wiki_df = wiki_df[wiki_df.Borough !='Not assigned']
wiki_df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [8]:
# We will now look at the Neighborhood column and replace "Not assigned" neighborhoods with their Borough names
# Further we will replace all NaN values in Neighborhood column with their Borough names

wiki_df.Neighbourhood.replace("Not assigned", wiki_df.Borough, inplace=True)
wiki_df.Neighbourhood.fillna(wiki_df.Borough, inplace=True)
wiki_df.reset_index(drop = True)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


### Grouping Data Based on Postcode

In [9]:
# We now group all the Neighborhoods sharing the same postal code together, using groupby, aggregate and string join/split

wiki_df2 = pd.DataFrame(wiki_df.groupby('Postcode', as_index=False).agg({'Borough': ', ' .join ,'Neighbourhood':', '.join}))
wiki_df2['Borough'] = wiki_df2['Borough'].str.split(',').str[0]
wiki_df2


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
wiki_df2.shape

(103, 3)

## Part 2: Getting Location Data

### Importing Libraries

In [11]:
!pip install geopy 
!pip install geocoder
import geocoder # import geocoder
from geopy.geocoders import Nominatim



In [12]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs

!pip install folium
import folium # map rendering library

from bs4 import BeautifulSoup
import lxml
print('Libraries imported.')

Libraries imported.


### Download Longitute & Latitute data for Geocoder Package

In [13]:
# Loading data
lat_data = pd.read_csv("https://cocl.us/Geospatial_data")
lat_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [14]:
# Merging dataframe. Renamed column name to avoid errors in merging
lat_data.rename(columns={'Postal Code':'Postcode'}, inplace=True)
wiki_df_lat = pd.merge(wiki_df2, lat_data)
wiki_df_lat

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Getting Coordinates for Toronto

In [15]:
# Geopy sometimes times out so we try and print error in case it does not fetch coordinates
from geopy.exc import GeocoderTimedOut

my_address = 'Toronto, ON'

geolocator = Nominatim()
try:
    location = geolocator.geocode(my_address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of Toronto, ON are {}, {}.'.format(latitude, longitude))
except GeocoderTimedOut as e:
    print("Error: geocode failed on input %s "%(my_address))

  


The geograpical coordinate of Toronto, ON are 43.653963, -79.387207.


### Creating map of Toronto with Folium, using coordinates obtained above

In [16]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(wiki_df_lat['Latitude'], wiki_df_lat['Longitude'], wiki_df_lat['Borough'], wiki_df_lat['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Creating a separate Dataframe for evaluating Borough that include "Toronto" keyword only

In [17]:
# We first identify Boroughs in Toronto
boroughs = wiki_df_lat['Borough'].unique()
boroughs

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke', "Queen's Park"], dtype=object)

In [18]:
# Next we include all the Boroughs that contain string Toronto in new dataframe
toronto_data = wiki_df_lat[(wiki_df_lat['Borough'] == 'East Toronto') | 
                           (wiki_df_lat['Borough'] == 'Central Toronto') | 
                           (wiki_df_lat['Borough'] == 'Downtown Toronto') | 
                           (wiki_df_lat['Borough'] == 'West Toronto')].reset_index(drop=True)
toronto_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [19]:
toronto_data.shape

(39, 5)

### Creating a map with data obtained above for the 39 Neighborhoods identified

In [20]:
# create map of Toronto using latitude and longitude values
map_toronto_2 = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_2)  
    
map_toronto_2

## Part 3: Use Foursquare API for Segmentation of Neighborhoods

### Getting Credentials for requests on API

In [21]:
CLIENT_ID = '0TBHOPQ5MAGXVJ2ONXUEDNPORUV2PCZ0UXO0LFDXT44U42YI' # your Foursquare ID
CLIENT_SECRET = 'KNPLEKZX420OQDB3I2YGQOYAGI4MO3QIP1IUH55OFJCQVUBN' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0TBHOPQ5MAGXVJ2ONXUEDNPORUV2PCZ0UXO0LFDXT44U42YI
CLIENT_SECRET:KNPLEKZX420OQDB3I2YGQOYAGI4MO3QIP1IUH55OFJCQVUBN


### Defining a function to fetch nearby venues from Foursquare API 

In [22]:
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Creating new dataframe for fetched venues

In [23]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

print(toronto_venues.shape)
toronto_venues.head()

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction Sout

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub


In [24]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,56,56,56,56,56,56
"Brockton, Exhibition Place, Parkdale Village",24,24,24,24,24,24
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",17,17,17,17,17,17
"Cabbagetown, St. James Town",44,44,44,44,44,44
Central Bay Street,83,83,83,83,83,83
"Chinatown, Grange Park, Kensington Market",84,84,84,84,84,84
Christie,18,18,18,18,18,18
Church and Wellesley,85,85,85,85,85,85


In [25]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 232 uniques categories.


## Part 4: Analyzing Neighborhoods

### One-Hot Encoding

In [100]:
# Initializing a new dataframe for one-hot encoding
toronto_onehot = pd.DataFrame(toronto_onehot)

# Adding Neighborhoods column to the new dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighborhood']

# One-hot encoding
toronto_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot = pd.concat([toronto_onehot, toronto_dummies], axis=1, sort=False)
toronto_onehot

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Grouping neighborhoods
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,...,0.0,0.0,0.012048,0.0,0.012048,0.0,0.012048,0.0,0.0,0.012048
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.047619,0.0,0.0,0.059524,0.011905,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.011765,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011765,0.0,0.011765,0.0,0.011765


In [102]:
# Forgot to import Numpy library
import numpy as np

# Function will return most common occurances of venues in each neighborhood
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [105]:
# Sorting and displaying top 10 venues per each neighborhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Coffee Shop,Bar,Café,Café,Bar,Asian Restaurant,Cosmetics Shop,Asian Restaurant,Steakhouse
1,Berczy Park,Coffee Shop,Coffee Shop,Cocktail Bar,Cocktail Bar,Beer Bar,Seafood Restaurant,Beer Bar,Steakhouse,Bakery,Seafood Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Café,Café,Yoga Studio,Coffee Shop,Breakfast Spot,Coffee Shop,Breakfast Spot,Yoga Studio,Grocery Store,Italian Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Light Rail Station,Yoga Studio,Spa,Pizza Place,Park,Gym / Fitness Center,Garden Center,Garden,Fast Food Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Service,Airport Lounge,Airport Lounge,Airport Terminal,Airport Terminal,Boat or Ferry,Boat or Ferry,Bar,Plane
5,"Cabbagetown, St. James Town",Coffee Shop,Coffee Shop,Chinese Restaurant,Restaurant,Pub,Italian Restaurant,Italian Restaurant,Pizza Place,Café,Bakery
6,Central Bay Street,Coffee Shop,Coffee Shop,Italian Restaurant,Italian Restaurant,Sandwich Place,Sandwich Place,Burger Joint,Café,Juice Bar,Japanese Restaurant
7,"Chinatown, Grange Park, Kensington Market",Café,Café,Vietnamese Restaurant,Vietnamese Restaurant,Coffee Shop,Coffee Shop,Chinese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Dumpling Restaurant
8,Christie,Café,Grocery Store,Café,Grocery Store,Park,Park,Baby Store,Italian Restaurant,Athletics & Sports,Coffee Shop
9,Church and Wellesley,Coffee Shop,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Restaurant,Gay Bar


### Clustering Neighborhoods using K - Means

In [112]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Adding two dataframes together: Toronto main data and Common Venues in neighborhoods

In [118]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Park,Trail,Park,Neighborhood,Neighborhood,Pub,Pub,Other Great Outdoors,Health Food Store,Other Great Outdoors
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Greek Restaurant,Coffee Shop,Coffee Shop,Ice Cream Shop,Italian Restaurant,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Furniture / Home Store
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Sandwich Place,Sandwich Place,Park,Fish & Chips Shop,Liquor Store,Liquor Store,Italian Restaurant,Food & Drink Shop,Burrito Place,Ice Cream Shop
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Café,Coffee Shop,Coffee Shop,American Restaurant,Italian Restaurant,Gastropub,Gastropub,Brewery,Italian Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Park,Bus Line,Swim School,Bus Line,Swim School,Yoga Studio,Modern European Restaurant,Market,Martial Arts Dojo


### Creating a map of Clustered data

In [120]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster Data

In [121]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Park,Trail,Park,Neighborhood,Neighborhood,Pub,Pub,Other Great Outdoors,Health Food Store,Other Great Outdoors
1,East Toronto,0,Greek Restaurant,Greek Restaurant,Coffee Shop,Coffee Shop,Ice Cream Shop,Italian Restaurant,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Furniture / Home Store
2,East Toronto,0,Sandwich Place,Sandwich Place,Park,Fish & Chips Shop,Liquor Store,Liquor Store,Italian Restaurant,Food & Drink Shop,Burrito Place,Ice Cream Shop
3,East Toronto,0,Café,Café,Coffee Shop,Coffee Shop,American Restaurant,Italian Restaurant,Gastropub,Gastropub,Brewery,Italian Restaurant
5,Central Toronto,0,Hotel,Hotel,Department Store,Dog Run,Sandwich Place,Breakfast Spot,Sandwich Place,Food & Drink Shop,Department Store,Dog Run
6,Central Toronto,0,Sporting Goods Shop,Sporting Goods Shop,Coffee Shop,Coffee Shop,Yoga Studio,Rental Car Location,Park,Mexican Restaurant,Gift Shop,Diner
7,Central Toronto,0,Sandwich Place,Pizza Place,Dessert Shop,Sandwich Place,Dessert Shop,Pizza Place,Gym,Sushi Restaurant,Café,Italian Restaurant
9,Central Toronto,0,Coffee Shop,Coffee Shop,Pub,Pub,Light Rail Station,Sushi Restaurant,Supermarket,Sports Bar,Sports Bar,Fried Chicken Joint
11,Downtown Toronto,0,Coffee Shop,Coffee Shop,Chinese Restaurant,Restaurant,Pub,Italian Restaurant,Italian Restaurant,Pizza Place,Café,Bakery
12,Downtown Toronto,0,Coffee Shop,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Restaurant,Gay Bar


#### From the cluster data above, we can identify key venue that differentiates this cluster are cafe shops and restaurants

In [122]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,1,Playground,Playground,Yoga Studio,Movie Theater,Market,Martial Arts Dojo,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant


#### From the cluster data above, we can identify key venue that differentiates this cluster to be Playgrounds

In [123]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,2,Garden,Garden,Yoga Studio,Movie Theater,Lounge,Market,Martial Arts Dojo,Mediterranean Restaurant,Men's Store,Metro Station


#### From the cluster data above, we can identify key venue that differentiates this cluster to be Gardens

In [125]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,3,Park,Park,Playground,Playground,Trail,Trail,Martial Arts Dojo,Market,Lounge,Monument / Landmark
23,Central Toronto,3,Park,Jewelry Store,Sushi Restaurant,Sushi Restaurant,Trail,Trail,Jewelry Store,Park,Market,Martial Arts Dojo


#### From the cluster data above, we can identify key venue that differentiates this cluster to be Parks and Playgrounds

In [126]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,4,Park,Park,Bus Line,Swim School,Bus Line,Swim School,Yoga Studio,Modern European Restaurant,Market,Martial Arts Dojo


#### From the cluster data above, we can identify key venue that differentiates this cluster to be Parks

### This marks the end of assignment.