### Scrape the Wikipedia webpage, wrangle data, clean data and read data into a *pandas* Dataframe

In [1]:
# Import requests library to send a request to the Wikipedia webpage´s server to retrieve data displayed on the webpage

import requests

In [2]:
# Retrieve the raw HTML 

website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

In [3]:
# Import BeautifulSoup library to parse data, to extract data from the HTML page

from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,"lxml")
# print(soup.prettify())

In [4]:
# Extract all data from the table - class wikitable sortable, where data is stored.

my_table = soup.find("table",{"class":"wikitable sortable"})
# my_table

In [5]:
# Find the header attributes of the table, starting and ending with "th"

header = [th.text.rstrip() for th in my_table.find_all("th")]
print(header)

['Postal Code', 'Borough', 'Neighbourhood']


In [6]:
# Remove spaces in headers´ name

for i in range(0, len(header)):
    header[i] = header[i].replace(" ","")
header

['PostalCode', 'Borough', 'Neighbourhood']

In [7]:
# Scrap the data from table and append it to a empty list, one list for each column in table; remove blank spaces, 
# if any, using strip.

c1=[]
c2=[]
c3=[]

for row in my_table.find_all("tr"):
    cells = row.find_all("td")
    if len(cells)==3:  
        c1.append(cells[0].find(text=True).strip())
        c2.append(cells[1].find(text=True).strip())
        c3.append(cells[2].find(text=True).strip())

In [8]:
# Check if data looks ok.

print("First column data:{}".format(c1))
print("\n")
print("Second column data:{}".format(c2))
print("\n")
print("Third column data:{}".format(c3))
print("\n")

First column data:['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B', 'M2B', 'M3B', 'M4B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B', 'M1C', 'M2C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C', 'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E', 'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M7H', 'M8H', 'M9H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M7J', 'M8J', 'M9J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M7K', 'M8K', 'M9K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M7L', 'M8L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M7M', 'M8M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M7N', 'M8N', 'M9N', 'M1P', 'M2P', 'M3P', 'M4P', 'M5P', 'M6P', 'M7P', 'M8P', 'M9P', 'M1R', 'M2R', 'M3R', 'M4R', 'M5R', 'M6R', 'M7R', 'M8R', 'M9R', 'M1S', 'M2S', 'M3S', 'M4S', 'M5S', 'M6S', 'M7S', 'M8S', 'M9S', 'M1T', 'M2T', 'M3T', 'M4T', 'M5T', 'M6T', 'M7T', 'M8T', 'M9T', 'M1V', 'M2V', 'M3V', 'M4V', 'M5V', '

In [9]:
# Create a dictionary with header elements as key.

d = dict([(x,0) for x in header])
d

{'PostalCode': 0, 'Borough': 0, 'Neighbourhood': 0}

In [10]:
# Add data from lists created into dictionary
d["PostalCode"] = c1
d["Borough"] = c2
d["Neighbourhood"] = c3

# Convert dictionary to Dataframe
import pandas as pd

df = pd.DataFrame(d)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [11]:
# Drop the rows that contains "Not assigned" values in Borough column

df_final = df[df.Borough!="Not assigned"]
df_final

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [12]:
# Reset the index. We can see that our dataframe has less rows (before 180 rows and after cleaning it has 103 rows).

df_final.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [13]:
# Check if there are duplicate values for PostalCode.

len(df_final["PostalCode"].unique())   # Still showing 103 rows which means there are no duplicates in PostalCode column

103

In [14]:
# Check if there are rows with "Not assigned" value in Neighbourhood column

"Not assigned" in df.Neighbourhood  # There are no rows with "Not assigned" value in Neighbourhood column

False

In [15]:
# Final format of the dataframe - number of rows (103) and columns (3)

df_final.shape

(103, 3)

### Get the latitude and the longitude coordinates of each neighborhood

In [16]:
# Read the geospational coordinates from csv file

lat_long = pd.read_csv("Geospatial_Coordinates.csv")
lat_long

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [17]:
lat_long.rename(columns={"Postal Code":"PostalCode"}, inplace=True)
lat_long

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [18]:
# Add the lat and long coordinated in the dataframe

df_finalmerged = df_final
df_finalmerged = df_finalmerged.join(lat_long.set_index('PostalCode'), on='PostalCode')

In [19]:
df_finalmerged

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [20]:
# Count the number of number_neighbourhoods in each borough
number_neighbourhood = df_finalmerged.groupby("Borough").count()
number_neighbourhood

Unnamed: 0_level_0,PostalCode,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


### Get the latitude and longitude values of Toronto

In [21]:
# Use geopy to convert an address into latitude and longitude values
# !conda install -c conda-forge geopy --yes 

In [22]:
from geopy.geocoders import Nominatim 

In order to define an instance of the geocoder, we need to define a user_agent and will name it *toronto_agent*.

In [23]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [24]:
# Import Folium visualization library
import folium 

In [25]:
# create map of Toronto using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)


# Add markers to map
for lat, lng, borough, df_final in zip(df_finalmerged['Latitude'], df_finalmerged['Longitude'], df_finalmerged['Borough'], 
                                           df_finalmerged['Neighbourhood']):
    label = '{}, {}'.format(df_finalmerged, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

### Explore Neighborhoods in Toronto - North York borough

#### Segment and cluster the North York borough . We will slice the original dataframe and create a new dataframe with only North York borough.

In [26]:
NorthYork = df_finalmerged[df_finalmerged['Borough'] == 'North York'].reset_index(drop=True)

In [27]:
NorthYork.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don Mills,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073


In [28]:
NorthYork.shape

(24, 5)

##### Get the coordinates of North York

In [29]:
address_northy = 'North York, Toronto'

geolocator_northy = Nominatim(user_agent="NorthYork_agent")
location_northy = geolocator_northy.geocode(address_northy)
latitude_northy = location_northy.latitude
longitude_northy = location_northy.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude_northy, longitude_northy))

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


##### Visualize the neighborhoods in North York.

In [30]:
map_NorthYork = folium.Map(location=[latitude_northy, longitude_northy], zoom_start=11)

for lat, lng, label in zip(NorthYork['Latitude'], NorthYork['Longitude'], NorthYork['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NorthYork)  
    
map_NorthYork

### Use Foursquare API to explore the neighborhoods and segment them

#### Define Foursquare Credentials and Version 

In [31]:
CLIENT_ID = '05M2JXCYWFFUP4EWFNRNA4ISQ4DN1QP32AYQPHYPNCEE3G0X'
CLIENT_SECRET = 'KJEZHZVVM2BJAYGA0RL0RDTQV5EN2Z424XEAFEGUSTWEMZXF'
VERSION = '20180605'
LIMIT = 100

print('Credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Credentails:
CLIENT_ID: 05M2JXCYWFFUP4EWFNRNA4ISQ4DN1QP32AYQPHYPNCEE3G0X
CLIENT_SECRET:KJEZHZVVM2BJAYGA0RL0RDTQV5EN2Z424XEAFEGUSTWEMZXF


#### Analyze and explore the neighbourhood in North York. Create the GET request URL and get relevant information for each nearby venue in a radius of 500 meters.

In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Using the above function, we will run it for each neighbourhood and create a new dataframe with all venues returned by Foursquare called NorthYork_venues

In [33]:
NorthYork_venues = getNearbyVenues(names=NorthYork['Neighbourhood'],
                                       latitudes=NorthYork['Latitude'],
                                       longitudes=NorthYork['Longitude'])

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


#### Check the size of the resulting dataframe

In [34]:
print(NorthYork_venues.shape)
NorthYork_venues.head()

(240, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


#### Get how many venues were returned for each neighbourhood

In [35]:
NorthYork_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",22,22,22,22,22,22
Don Mills,23,23,23,23,23,23
Downsview,16,16,16,16,16,16
"Fairview, Henry Farm, Oriole",70,70,70,70,70,70
Glencairn,5,5,5,5,5,5
Hillcrest Village,5,5,5,5,5,5
Humber Summit,2,2,2,2,2,2
"Humberlea, Emery",1,1,1,1,1,1


#### Check how many unique categories can be curated from all the returned venues


In [36]:
print('There are {} uniques categories.'.format(len(NorthYork_venues['Venue Category'].unique())))

There are 102 uniques categories.


### Analyze each neighbourhood

In [37]:
# one hot encoding
NorthYork_onehot = pd.get_dummies(NorthYork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
NorthYork_onehot['Neighbourhood'] = NorthYork_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [NorthYork_onehot.columns[-1]] + list(NorthYork_onehot.columns[:-1])
NorthYork_onehot = NorthYork_onehot[fixed_columns]

NorthYork_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
NorthYork_onehot.shape

(240, 103)

#### Group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category

In [39]:
NorthYork_grouped = NorthYork_onehot.groupby('Neighbourhood').mean().reset_index()
NorthYork_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,...,0.047619,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,...,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
NorthYork_grouped.shape

(19, 103)

#### Let's print each neighbourhood along with the top 5 most common venues

In [41]:
num_top_venues = 5

for hood in NorthYork_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = NorthYork_grouped[NorthYork_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
            venue  freq
0     Coffee Shop  0.10
1            Bank  0.10
2  Ice Cream Shop  0.05
3     Bridal Shop  0.05
4   Deli / Bodega  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.09
1  Italian Restaurant  0.09
2         Coffee Shop  0.09
3       Grocery Store  0.05
4    Greek Restaurant  0.05


----Don Mills----
                 venue  freq
0                  Gym  0.13
1          Coffee Shop  0.09
2  Japanese Restaurant  0.09
3           Beer Store  0.09
4   Dim Sum Restaurant  0.04


----Downsview----
            venue  freq
0   Grocery Store  0.19
1            Park  0.12
2  Baseball Field  0.06
3           Hotel  0.06
4         Airport  0.06


----Fairview, Henry Farm, Or

#### Put that data into a _pandas_ dataframe

In [42]:
import numpy as np

In [43]:
# A function to sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# Display the top 10 venues for each neighborhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = NorthYork_grouped['Neighbourhood']

for ind in np.arange(NorthYork_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Mobile Phone Shop,Sandwich Place,Gas Station,Diner,Ice Cream Shop,Deli / Bodega,Chinese Restaurant,Middle Eastern Restaurant
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Discount Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Greek Restaurant,Grocery Store,Indian Restaurant,Juice Bar,Liquor Store,Locksmith,Comfort Food Restaurant
3,Don Mills,Gym,Beer Store,Coffee Shop,Japanese Restaurant,Caribbean Restaurant,Clothing Store,Chinese Restaurant,Café,Dim Sum Restaurant,Italian Restaurant
4,Downsview,Grocery Store,Park,Bank,Liquor Store,Hotel,Shopping Mall,Home Service,Baseball Field,Business Service,Gym / Fitness Center


### Cluster Neighbourhoods

#### Run _k_-means to cluster the neighbourhood into 5 clusters.

In [44]:
# Import k-means from clustering stage
from sklearn.cluster import KMeans


In [45]:
# Set number of clusters, run clustering and check cluster labels
kclusters = 5

NorthYork_grouped_clustering = NorthYork_grouped.drop('Neighbourhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NorthYork_grouped_clustering)

kmeans.labels_

array([4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 4, 0, 4, 0, 4, 4, 4, 0, 1])

#### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood.

In [46]:
neighbourhoods_venues_sorted.insert(0, 'ClusterLabels', kmeans.labels_)

In [47]:
neighbourhoods_venues_sorted2 = neighbourhoods_venues_sorted
neighbourhoods_venues_sorted2.drop(['ClusterLabels'], axis=1, inplace=True)

In [48]:
NorthYork_merged = NorthYork
NorthYork_merged = NorthYork_merged.join(neighbourhoods_venues_sorted2.set_index('Neighbourhood'), on='Neighbourhood')
NorthYork_merged = NorthYork_merged.dropna()

In [49]:
NorthYork_merged = NorthYork_merged.dropna()

In [50]:
# Add clustering labels to dataset

neighbourhoods_venues_sorted.insert(0, 'ClusterLabels', kmeans.labels_)
neighbourhoods_venues_sorted.drop(neighbourhoods_venues_sorted.columns[2:12], axis=1, inplace = True)

In [51]:
# Merge NorthYork_merged with NorthYork data to add latitude/longitude for each neighbourhood

NorthYork_merged = NorthYork_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

NorthYork_merged.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,ClusterLabels
0,M3A,North York,Parkwoods,43.753259,-79.329656,Food & Drink Shop,Park,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Intersection,Diner,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,4
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Clothing Store,Women's Store,Vietnamese Restaurant,Boutique,Coffee Shop,Event Space,Furniture / Home Store,Gift Shop,Accessories Store,Supermarket,4
3,M3B,North York,Don Mills,43.745906,-79.352188,Gym,Beer Store,Coffee Shop,Japanese Restaurant,Caribbean Restaurant,Clothing Store,Chinese Restaurant,Café,Dim Sum Restaurant,Italian Restaurant,4
4,M6B,North York,Glencairn,43.709577,-79.445073,Park,Bakery,Pizza Place,Pub,Japanese Restaurant,Food Truck,Dessert Shop,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,4
5,M3C,North York,Don Mills,43.7259,-79.340923,Gym,Beer Store,Coffee Shop,Japanese Restaurant,Caribbean Restaurant,Clothing Store,Chinese Restaurant,Café,Dim Sum Restaurant,Italian Restaurant,4
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,Dog Run,Athletics & Sports,Mediterranean Restaurant,Pool,Golf Course,Fried Chicken Joint,Food Truck,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,4
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,Coffee Shop,Bank,Mobile Phone Shop,Sandwich Place,Gas Station,Diner,Ice Cream Shop,Deli / Bodega,Chinese Restaurant,Middle Eastern Restaurant,4
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Clothing Store,Coffee Shop,Fast Food Restaurant,Juice Bar,Japanese Restaurant,Women's Store,Food Court,Shoe Store,Bakery,Bank,4
9,M3J,North York,"Northwood Park, York University",43.76798,-79.487262,Furniture / Home Store,Miscellaneous Shop,Caribbean Restaurant,Metro Station,Massage Studio,Bar,Coffee Shop,Women's Store,Dim Sum Restaurant,Construction & Landscaping,4


#### Visualize the resulting clusters

In [52]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


In [53]:
# create map
map_clusters = folium.Map(location=[latitude_northy, longitude_northy], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = ["darkblue","red","green","darkorange","purple"]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NorthYork_merged['Latitude'],NorthYork_merged['Longitude'],
                                  NorthYork_merged['Neighbourhood'], NorthYork_merged['ClusterLabels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color = rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters