<a href="https://colab.research.google.com/github/MaguireMaName/Coursera_Capstone/blob/master/Machine_Learning_w_Python_Segment_%26_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install geocoder
#!pip install requests

In [0]:
# bring in dependencies 
import geocoder
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as rq
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Machine Learning with Python: Dataframe of postal code, neighborhood, & borough
*For the Applied Data Science Capstone Project*

In [0]:
# define url for scraping and print

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
print(url)

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [0]:
response = rq.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

#print(soup)

In [0]:
table = soup.find('table', {'class':'wikitable sortable'}).tbody
#print(table)

rows = table.find_all('tr')

columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]

df_a = pd.DataFrame(columns=columns)

for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) ==4:
      values = [tds[0].text, tds[1].text,'', tds[2].text, tds[3].text.replace('\n','').replace('\xa0','')]
    else:
      values = [td.text.replace('\n','').replace('\xa0','') for td in tds]
    
    df_a = df_a.append(pd.Series(values, index=columns), ignore_index=True)


In [0]:
# dimensions before aggregation

df_a.shape

(288, 3)

In [0]:
# aggregate data

df_b = df_a.groupby(['Postcode','Borough']).agg(lambda x: x.tolist()).reset_index()

In [0]:
df_b.shape

(180, 3)

In [0]:
# check results

df_b.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,[Not assigned]
1,M1B,Scarborough,"[Rouge, Malvern]"
2,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[Woburn]


In [0]:
# where neighbourhood is not assigned, replace it with borough

df_b['Neighbourhood'] = np.where(df_b['Neighbourhood'] == "Not assigned", df_b['Borough'], df_b['Neighbourhood'])


In [0]:
# exception table

x_neighbourhood = df_b.loc[(df_b['Neighbourhood'] == "Not assigned")]
x_neighbourhood.shape

(0, 3)

In [0]:
# exception table

x_borough = df_b.loc[(df_b['Borough'] == "Not assigned")]
x_borough.shape

(77, 3)

In [0]:
# 77 boroughs not assigned. Don't process obs. where borough = 'Not assigned'

df_c = df_b.drop(df_b[df_b.Borough == "Not assigned"].index)


In [0]:
# exception table

x_borough = df_c.loc[(df_b['Borough'] == "Not assigned")]
x_borough.shape

(0, 3)

In [0]:
# dimensions after aggregation

df_c.shape

(103, 3)

In [0]:
# load in lat and lon info

df_geo = pd.read_csv('Geospatial_Coordinates.csv')

In [0]:
# check geo load

df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
df_geo.rename(columns = {"Postal Code": "Postcode"}, 
                                 inplace = True) 

In [0]:
# join geo with neighbourhood data

df_d = pd.merge(df_geo, df_c, on='Postcode')

In [0]:
# check dimensions

df_d.shape

(103, 5)

In [0]:
# check data

df_d.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"[Rouge, Malvern]"
1,M1C,43.784535,-79.160497,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,43.763573,-79.188711,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,43.770992,-79.216917,Scarborough,[Woburn]
4,M1H,43.773136,-79.239476,Scarborough,[Cedarbrae]


In [0]:
# will explore features of neighbourhoods in York & Toronto

df_e = df_d[df_d['Borough'].str.contains("York|Toronto")].reset_index(drop=True)
df_e.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M2H,43.803762,-79.363452,North York,[Hillcrest Village]
1,M2J,43.778517,-79.346556,North York,"[Fairview, Henry Farm, Oriole]"
2,M2K,43.786947,-79.385975,North York,[Bayview Village]
3,M2L,43.75749,-79.374714,North York,"[Silver Hills, York Mills]"
4,M2M,43.789053,-79.408493,North York,"[Newtonbrook, Willowdale]"


In [0]:
# OPEN IN COLLAB TO VIEW MAP OUTPUTS

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(df_e['Latitude'], df_e['Longitude'], df_e['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [0]:
# define Foursquare Credentials and Version

client_id = 'KL5SVGOS40RKZBQK4G1VXYBKBICWCDQL2NMCASHFYER432SS' #  Foursquare ID'
client_secret = '1A5KPYJQIATH0SDZXPPZ5YK0SHLBYVEGPER5AAIIMDXLZ0AB' #  Foursquare Secret
version = '20180604'
limit = 30

In [0]:
# let's create a function to repeat the same process to all the neighborhoods in toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [0]:
# run the above function on each neighborhood and create a new dataframe called toronto_venues

toronto_venues = getNearbyVenues(names=df_e['Neighbourhood'],
                                   latitudes=df_e['Latitude'],
                                   longitudes=df_e['Longitude']
                                  )

['Hillcrest Village']
['Fairview', 'Henry Farm', 'Oriole']
['Bayview Village']
['Silver Hills', 'York Mills']
['Newtonbrook', 'Willowdale']
['Willowdale South']
['York Mills West']
['Willowdale West']
['Parkwoods']
['Don Mills North']
['Flemingdon Park', 'Don Mills South']
['Bathurst Manor', 'Downsview North', 'Wilson Heights']
['Northwood Park', 'York University']
['CFB Toronto', 'Downsview East']
['Downsview West']
['Downsview Central']
['Downsview Northwest']
['Victoria Village']
['Woodbine Gardens', 'Parkview Hill']
['Woodbine Heights']
['The Beaches']
['Leaside']
['Thorncliffe Park']
['East Toronto']
['The Danforth West', 'Riverdale']
['The Beaches West', 'India Bazaar']
['Studio District']
['Lawrence Park']
['Davisville North']
['North Toronto West']
['Davisville']
['Moore Park', 'Summerhill East']
['Deer Park', 'Forest Hill SE', 'Rathnelly', 'South Hill', 'Summerhill West']
['Rosedale']
['Cabbagetown', 'St. James Town']
['Church and Wellesley']
['Harbourfront', 'Regent Park']
['

In [0]:
# check dimensions and data

print(toronto_venues.shape)
toronto_venues.head()

(1121, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,[Hillcrest Village],43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,[Hillcrest Village],43.803762,-79.363452,New York Fries,43.803664,-79.363905,Fast Food Restaurant
2,[Hillcrest Village],43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
3,[Hillcrest Village],43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
4,[Hillcrest Village],43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run


In [0]:
# change data type before grouping and counting the number of venues

toronto_venues['Neighborhood'] = toronto_venues['Neighborhood'].apply(tuple)

In [0]:
# the number of venues returned for each neighborhood

toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(Adelaide, King, Richmond)",30,30,30,30,30,30
"(Bathurst Manor, Downsview North, Wilson Heights)",18,18,18,18,18,18
"(Bayview Village,)",4,4,4,4,4,4
"(Bedford Park, Lawrence Manor East)",25,25,25,25,25,25
"(Berczy Park,)",30,30,30,30,30,30
"(Brockton, Exhibition Place, Parkdale Village)",21,21,21,21,21,21
"(Business Reply Mail Processing Centre 969 Eastern,)",17,17,17,17,17,17
"(CFB Toronto, Downsview East)",2,2,2,2,2,2
"(CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara)",15,15,15,15,15,15
"(Cabbagetown, St. James Town)",30,30,30,30,30,30


In [0]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 220 uniques categories.


In [0]:
# analyse each neighbourhood

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Bike Shop,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Burger Joint,...,Rock Climbing Spot,Salad Place,Salon / Barbershop,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Swim School,Taco Place,Tailor Shop,Taiwanese Restaurant,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# check dimensions

toronto_onehot.shape

(1121, 220)

In [0]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Bike Shop,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,...,Rock Climbing Spot,Salad Place,Salon / Barbershop,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Swim School,Taco Place,Tailor Shop,Taiwanese Restaurant,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"(Adelaide, King, Richmond)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.033333,0.0,0.0,0.000000,0.000000,0.000000,0.066667,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.033333,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,0.000000,0.000000,0.0,0.033333,0.000000,0.000000,0.033333,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.033333,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,"(Bathurst Manor, Downsview North, Wilson Heights)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.055556,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.055556,0.000000,...,0.000000,0.000000,0.000000,0.055556,0.000000,0.000000,0.055556,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.055556,0.055556,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000
2,"(Bayview Village,)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.250000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"(Bedford Park, Lawrence Manor East)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.040000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.040000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.040000,0.00,0.000000,0.000000,0.000000,0.000000,0.040000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"(Berczy Park,)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.033333,0.000000,0.000000,0.000000,0.00,0.033333,0.066667,0.000000,0.000000,0.000000,0.033333,0.000000,0.000000,0.000000,0.033333,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.066667,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.033333,0.033333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,"(Brockton, Exhibition Place, Parkdale Village)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.047619,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.095238,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.047619,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,(Business Reply Mail Processing Centre 969 Eas...,0.058824,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.058824,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058824,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058824,0.0,0.058824,0.000000,0.058824,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"(CFB Toronto, Downsview East)",0.000000,0.0,0.000000,0.500000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,"(CN Tower, Bathurst Quay, Island airport, Harb...",0.000000,0.0,0.000000,0.066667,0.066667,0.066667,0.133333,0.2,0.133333,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.066667,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.066667,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"(Cabbagetown, St. James Town)",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.066667,0.033333,0.000000,0.000000,0.00,0.000000,0.000000,0.033333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.033333,0.000000,0.033333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [0]:
# check dimensions

toronto_grouped.shape

(71, 220)

In [0]:
# changing data type to enable freq counts

toronto_grouped['Neighborhood'] = toronto_grouped['Neighborhood'].astype(str)

In [0]:
# top 5 frequencies

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----('Adelaide', 'King', 'Richmond')----
              venue  freq
0        Steakhouse  0.10
1  Asian Restaurant  0.07
2              Café  0.07
3       Pizza Place  0.07
4             Hotel  0.07


----('Bathurst Manor', 'Downsview North', 'Wilson Heights')----
                  venue  freq
0           Coffee Shop  0.11
1         Deli / Bodega  0.06
2  Fast Food Restaurant  0.06
3           Bridal Shop  0.06
4        Sandwich Place  0.06


----('Bayview Village',)----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2  Japanese Restaurant  0.25
3                 Café  0.25
4          Yoga Studio  0.00


----('Bedford Park', 'Lawrence Manor East')----
                  venue  freq
0           Coffee Shop  0.08
1    Italian Restaurant  0.08
2           Pizza Place  0.08
3             Juice Bar  0.08
4  Fast Food Restaurant  0.08


----('Berczy Park',)----
                venue  freq
0  Seafood Restaurant  0.07
1                Café  0.07
2          

In [0]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [0]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"('Adelaide', 'King', 'Richmond')",Steakhouse,Café,Hotel,Pizza Place,Asian Restaurant,Gym / Fitness Center,Breakfast Spot,Seafood Restaurant,Smoke Shop,Lounge
1,"('Bathurst Manor', 'Downsview North', 'Wilson ...",Coffee Shop,Diner,Restaurant,Supermarket,Sushi Restaurant,Fast Food Restaurant,Sandwich Place,Bank,Middle Eastern Restaurant,Pizza Place
2,"('Bayview Village',)",Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
3,"('Bedford Park', 'Lawrence Manor East')",Coffee Shop,Juice Bar,Italian Restaurant,Fast Food Restaurant,Pizza Place,Thai Restaurant,Restaurant,Liquor Store,Pub,Indian Restaurant
4,"('Berczy Park',)",Beer Bar,Farmers Market,Cocktail Bar,Café,Seafood Restaurant,Bistro,Bakery,Italian Restaurant,Basketball Stadium,Jazz Club


In [0]:
# cluster neighbourhoods

# set number of clusters
kclusters = 7

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([6, 6, 6, 6, 6, 6, 6, 1, 6, 6], dtype=int32)

In [0]:
# add clustering labels
neighborhoods_venues_sorted.insert(1, 'Cluster Labels', kmeans.labels_)

In [0]:
# change data type for merge

df_e['Neighbourhood'] = df_e['Neighbourhood'].apply(tuple)
df_e['Neighbourhood'] = df_e['Neighbourhood'].astype(str)
df_e.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M2H,43.803762,-79.363452,North York,"('Hillcrest Village',)"
1,M2J,43.778517,-79.346556,North York,"('Fairview', 'Henry Farm', 'Oriole')"
2,M2K,43.786947,-79.385975,North York,"('Bayview Village',)"
3,M2L,43.75749,-79.374714,North York,"('Silver Hills', 'York Mills')"
4,M2M,43.789053,-79.408493,North York,"('Newtonbrook', 'Willowdale')"


In [0]:
neighborhoods_venues_sorted.tail()

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
66,"('Willowdale South',)",6,Coffee Shop,Ramen Restaurant,Café,Restaurant,Sandwich Place,Movie Theater,Middle Eastern Restaurant,Pet Store,Plaza,Lounge
67,"('Willowdale West',)",4,Pharmacy,Pizza Place,Coffee Shop,Discount Store,Women's Store,Creperie,Dog Run,Diner,Dim Sum Restaurant,Dessert Shop
68,"('Woodbine Gardens', 'Parkview Hill')",6,Fast Food Restaurant,Pizza Place,Gastropub,Athletics & Sports,Pet Store,Pharmacy,Intersection,Gym / Fitness Center,Café,Bank
69,"('Woodbine Heights',)",6,Skating Rink,Cosmetics Shop,Dance Studio,Curling Ice,Beer Store,Park,Bus Stop,Video Store,Pharmacy,Coffee Shop
70,"('York Mills West',)",1,Park,Bank,Convenience Store,Curling Ice,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant


In [0]:
# rename column for merge

neighborhoods_venues_sorted.rename(columns = {"Neighborhood": "Neighbourhood"}, 
                                 inplace = True) 
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"('Adelaide', 'King', 'Richmond')",6,Steakhouse,Café,Hotel,Pizza Place,Asian Restaurant,Gym / Fitness Center,Breakfast Spot,Seafood Restaurant,Smoke Shop,Lounge
1,"('Bathurst Manor', 'Downsview North', 'Wilson ...",6,Coffee Shop,Diner,Restaurant,Supermarket,Sushi Restaurant,Fast Food Restaurant,Sandwich Place,Bank,Middle Eastern Restaurant,Pizza Place
2,"('Bayview Village',)",6,Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
3,"('Bedford Park', 'Lawrence Manor East')",6,Coffee Shop,Juice Bar,Italian Restaurant,Fast Food Restaurant,Pizza Place,Thai Restaurant,Restaurant,Liquor Store,Pub,Indian Restaurant
4,"('Berczy Park',)",6,Beer Bar,Farmers Market,Cocktail Bar,Café,Seafood Restaurant,Bistro,Bakery,Italian Restaurant,Basketball Stadium,Jazz Club


In [0]:
# merge dataset and check output

toronto_york__merged = pd.merge(df_e, neighborhoods_venues_sorted, on='Neighbourhood')
toronto_york__merged.tail()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
66,M6S,43.651571,-79.48445,West Toronto,"('Runnymede', 'Swansea')",6,Café,Coffee Shop,Pizza Place,Italian Restaurant,Sushi Restaurant,Bookstore,Tea Room,Indie Movie Theater,Smoothie Shop,Falafel Restaurant
67,M7Y,43.662744,-79.321558,East Toronto,('Business Reply Mail Processing Centre 969 Ea...,6,Light Rail Station,Yoga Studio,Auto Workshop,Fast Food Restaurant,Farmers Market,Comic Shop,Park,Pizza Place,Burrito Place,Recording Studio
68,M9L,43.756303,-79.565963,North York,"('Humber Summit',)",4,Empanada Restaurant,Pizza Place,Creperie,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store
69,M9M,43.724766,-79.532242,North York,"('Emery', 'Humberlea')",5,Baseball Field,Women's Store,Empanada Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
70,M9N,43.706876,-79.518188,York,"('Weston',)",1,Park,Convenience Store,Women's Store,Cuban Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


In [0]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_york__merged['Latitude'], toronto_york__merged['Longitude'], toronto_york__merged['Neighbourhood'], toronto_york__merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [0]:
# cluster 0
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 0, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
45,43.711695,0,Garden,Women's Store,Creperie,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store


In [0]:
# cluster 1
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 1, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,43.752758,1,Park,Bank,Convenience Store,Curling Ice,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant
7,43.753259,1,Park,Fast Food Restaurant,Food & Drink Shop,Women's Store,Cuban Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
12,43.737473,1,Park,Airport,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
22,43.685347,1,Park,Coffee Shop,Convenience Store,Women's Store,Cuban Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant
26,43.72802,1,Park,Swim School,Bus Line,Lake,Curling Ice,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner
32,43.679563,1,Park,Playground,Trail,Creperie,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
46,43.696948,1,Park,Jewelry Store,Trail,Sushi Restaurant,Women's Store,Cuban Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant
56,43.689026,1,Park,Fast Food Restaurant,Pharmacy,Market,Women's Store,Garden,Costume Shop,Dim Sum Restaurant,Dessert Shop,Gastropub
61,43.713756,1,Park,Basketball Court,Bakery,Construction & Landscaping,Curling Ice,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner
70,43.706876,1,Park,Convenience Store,Women's Store,Cuban Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


In [0]:
# cluster 2
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 2, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,43.789053,2,Piano Bar,Women's Store,Creperie,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store


In [0]:
# cluster 3
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 3, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,43.728496,3,Food Truck,Baseball Field,Home Service,Women's Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant


In [0]:
# cluster 4
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 4, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,43.782736,4,Pharmacy,Pizza Place,Coffee Shop,Discount Store,Women's Store,Creperie,Dog Run,Diner,Dim Sum Restaurant,Dessert Shop
16,43.725882,4,Coffee Shop,Pizza Place,Portuguese Restaurant,Hockey Arena,Women's Store,Creperie,Dog Run,Discount Store,Diner,Dim Sum Restaurant
63,43.673185,4,Bus Line,Grocery Store,Pizza Place,Caribbean Restaurant,Brewery,Women's Store,Dog Run,Discount Store,Diner,Dim Sum Restaurant
68,43.756303,4,Empanada Restaurant,Pizza Place,Creperie,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store


In [0]:
# cluster 5
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 5, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
69,43.724766,5,Baseball Field,Women's Store,Empanada Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


In [0]:
# cluster 6
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 6, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,43.803762,6,Golf Course,Dog Run,Fast Food Restaurant,Pool,Mediterranean Restaurant,Cuban Restaurant,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
1,43.778517,6,Clothing Store,Coffee Shop,Pharmacy,Salon / Barbershop,Japanese Restaurant,Department Store,Shopping Mall,Electronics Store,Smoothie Shop,Bank
2,43.786947,6,Chinese Restaurant,Bank,Japanese Restaurant,Café,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
4,43.77012,6,Coffee Shop,Ramen Restaurant,Café,Restaurant,Sandwich Place,Movie Theater,Middle Eastern Restaurant,Pet Store,Plaza,Lounge
8,43.745906,6,Basketball Court,Gym / Fitness Center,Caribbean Restaurant,Japanese Restaurant,Café,Women's Store,Curling Ice,Dumpling Restaurant,Dog Run,Discount Store
9,43.7259,6,Beer Store,Gym,Asian Restaurant,Coffee Shop,Bike Shop,Fast Food Restaurant,Clothing Store,Chinese Restaurant,Café,Dim Sum Restaurant
10,43.754328,6,Coffee Shop,Diner,Restaurant,Supermarket,Sushi Restaurant,Fast Food Restaurant,Sandwich Place,Bank,Middle Eastern Restaurant,Pizza Place
11,43.76798,6,Coffee Shop,Falafel Restaurant,Bar,Furniture / Home Store,Massage Studio,Curling Ice,Dumpling Restaurant,Dog Run,Discount Store,Diner
13,43.739015,6,Hotel,Park,Shopping Mall,Grocery Store,Bank,Moving Target,Women's Store,Discount Store,Diner,Dim Sum Restaurant
15,43.761631,6,Liquor Store,Grocery Store,Discount Store,Athletics & Sports,Women's Store,Curling Ice,Dumpling Restaurant,Dog Run,Diner,Dim Sum Restaurant


In [0]:
# cluster 7
toronto_york__merged.loc[toronto_york__merged['Cluster Labels'] == 7, toronto_york__merged.columns[[1] + list(range(5, toronto_york__merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
