# Step 1 - Read Data From Wikipedia

In [10]:
# main documentation page: http://beautiful-soup-4.readthedocs.io/en/latest/
# how to use the BeautifulSoup package: https://www.youtube.com/watch?v=ng2o98k983k video
from bs4 import BeautifulSoup 
import pandas as pd

### Read Postcode Data from Wikipeadia


In [11]:
# read data

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

source = requests.get(url).text
soup = BeautifulSoup(source,'lxml')

table = soup.find('table')
#table.prettify()

### Create Dataframe

In [12]:
# parse data and create dataframe 
postcode =[]
borough =[]
neighbourhood =[]

for tr in table.find_all('tr'):
    i = 1 
    for td in tr.find_all('td'):
        if i == 1:
            postcode.append(td.text)    
        elif i == 2:    
            borough.append(td.text)
        elif i == 3:        
            neighbourhood.append(td.text.replace('\n',''))
        i +=1
        
dict = {'PostalCode': postcode, 'Borough': borough, 'Neighbourhood': neighbourhood}        

df = pd.DataFrame.from_dict(dict)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove Borough = 'Not assigned'

In [13]:
# delete  rows with Borough = 'Not assigned'
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Generate Groupby Postcode Result DataFrame

In [14]:
# group by Postcode and generate sets for Borough,Neighbourhood 
boroughLst = df.groupby(['PostalCode'])['Borough'].apply(set)
neighbourhoodLst = df.groupby(['PostalCode'])['Neighbourhood'].apply(set)

# create df_result by mergin 2 sets on join Postcode
df_result = pd.merge(pd.DataFrame(boroughLst), pd.DataFrame(neighbourhoodLst), how='outer', on=['PostalCode'])

# reset index to make Postcode columns 
df_result.reset_index(inplace=True)

# convert sets to String 
for i in range(len(df_result)-1):
    df_result.Neighbourhood[i] = ', '.join(df_result.Neighbourhood[i])
    df_result.Borough[i] = ', '.join(df_result.Borough[i])

# Update    Neighbourhood = 'Not assigned' with  Borough value. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 
def myFun(Neighbourhood, Borough):
    if Neighbourhood =='Not assigned':
        return Borough
    else:
        return Neighbourhood
df_result.Neighbourhood = df_result.apply(lambda row :  myFun(row.Neighbourhood, row.Borough) , axis = 1) 

# show head 
df_result.head(12)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Morningside, West Hill, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [15]:
# M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma 
df_result[df_result.PostalCode =='M5A']

Unnamed: 0,PostalCode,Borough,Neighbourhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [16]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.
df_result[df_result.PostalCode =='M7A']

Unnamed: 0,PostalCode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


### Print Shape Result

In [17]:
print('df_result.shape : ', df_result.shape)

df_result.shape :  (103, 3)


# Step 2 - Read Geocodes 

### Get geographical coordinates of each postal code

In [18]:
# read geocode coordinates from csv file
df_geocodes = pd.read_csv('http://cocl.us/Geospatial_data')
df_geocodes.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
df_geocodes.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge df_geocodes with df_result 

In [19]:
#Merge df_geocodes with df_result 
df_result_geo = pd.merge(df_result, df_geocodes, how='outer', on=['PostalCode'])

# reset index to make PostalCode columns 
df_result_geo.reset_index(inplace=True)
df_result_geo = df_result_geo[['PostalCode','Borough','Neighbourhood', 'Latitude','Longitude']]
df_result_geo.head(12)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Step 3 - Analyze Neighborhood

## 1. Download and Explore Dataset

### Generate Dataframe with Boroughs Contains Toronto

In [20]:
# generate df_toronto boroughs that contain the word Toronto
df_toronto = df_result_geo[df_result_geo.apply(lambda row : 'Toronto' in  row.Borough , axis = 1) ]
df_toronto.reset_index(inplace=True)
df_toronto = df_toronto[['PostalCode','Borough','Neighbourhood', 'Latitude','Longitude']]

print('df_toronto.shape : ', df_toronto.shape)
df_toronto.head()

df_toronto.shape :  (38, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Create Map

In [21]:
#!conda install -c conda-forge folium=0.5.0
import folium # map rendering library
from geopy.geocoders import Nominatim

In [22]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [31]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Define Foursquare Credentials and Version

In [2]:
#@hidden cell
CLIENT_ID = 'PR4BTKNRIZEBQKKAW45YMI4R0E0J12EG1LBMSHSQFPGQTHLB' # your Foursquare ID
CLIENT_SECRET = 'KDJBDP01YBJ5JIH2IGFVR2V2T02AF4RNSCT45EL5LZRO32OL' # your Foursquare Secret
VERSION = '20191003' # Foursquare API version


#### Now, let's get the top 100 venues that are in Toronto within a radius of 500 meters.

In [25]:
# type your answer here
radius = 500 # define radius
LIMIT = 100 # limit of number of venues returned by Foursquare API
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url # display URL


'https://api.foursquare.com/v2/venues/explore?&client_id=PR4BTKNRIZEBQKKAW45YMI4R0E0J12EG1LBMSHSQFPGQTHLB&client_secret=KDJBDP01YBJ5JIH2IGFVR2V2T02AF4RNSCT45EL5LZRO32OL&v=20191003&ll=43.653963,-79.387207&radius=500&limit=100'

In [32]:
import requests 
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d972ffedd70c5002cba3b54'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 77,
  'suggestedBounds': {'ne': {'lat': 43.6584630045, 'lng': -79.38099903084075},
   'sw': {'lat': 43.649462995499995, 'lng': -79.39341496915925}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65323167517444,
          

From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [33]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [34]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Japango,Sushi Restaurant,43.655268,-79.385165
2,Sansotei Ramen 三草亭,Ramen Restaurant,43.655157,-79.386501
3,Cafe Plenty,Café,43.654571,-79.38945
4,Poke Guys,Poke Place,43.654895,-79.385052


In [35]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

77 venues were returned by Foursquare.


## 2. Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronta_venues*.

In [42]:

toronta_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )



The Beaches
Riverdale, The Danforth West
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East, Moore Park
Deer Park, Rathnelly, Forest Hill SE, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
King, Adelaide, Richmond
Toronto Islands, Harbourfront East, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill West, Forest Hill North
North Midtown, The Annex, Yorkville
University of Toronto, Harbord
Kensington Market, Grange Park, Chinatown
Harbourfront West, King and Spadina, Railway Lands, South Niagara, CN Tower, Island airport, Bathurst Quay
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Trinity, Little Portugal
Exhibition Place, Parkdale Village, Brockton
The Junction So

In [43]:
print(toronta_venues.shape)
toronta_venues.head()

(1712, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
4,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors


In [44]:
toronta_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
Business Reply Mail Processing Centre 969 Eastern,15,15,15,15,15,15
"Cabbagetown, St. James Town",44,44,44,44,44,44
Central Bay Street,86,86,86,86,86,86
Christie,16,16,16,16,16,16
Church and Wellesley,86,86,86,86,86,86
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,34,34,34,34,34,34
Davisville North,7,7,7,7,7,7
"Deer Park, Rathnelly, Forest Hill SE, South Hill, Summerhill West",15,15,15,15,15,15


#### Let's find out how many unique categories can be curated from all the returned venues

In [45]:
print('There are {} uniques categories.'.format(len(toronta_venues['Venue Category'].unique())))

There are 237 uniques categories.


## 3. Analyze Each Neighborhood

In [52]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronta_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronta_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
toronto_onehot.shape

(1712, 238)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each 

In [54]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0
1,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667
2,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,...,0.0,0.0,0.0,0.011628,0.0,0.0,0.011628,0.0,0.0,0.011628
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,...,0.0,0.0,0.0,0.0,0.011628,0.011628,0.0,0.0,0.011628,0.011628
6,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,...,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Deer Park, Rathnelly, Forest Hill SE, South Hi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [55]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2                Café  0.04
3  Seafood Restaurant  0.04
4            Beer Bar  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0         Yoga Studio  0.07
1  Light Rail Station  0.07
2                Park  0.07
3          Comic Shop  0.07
4    Recording Studio  0.07


----Cabbagetown, St. James Town----
                venue  freq
0         Coffee Shop  0.09
1                 Pub  0.05
2  Italian Restaurant  0.05
3                Café  0.05
4          Restaurant  0.05


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.15
1                Café  0.06
2  Italian Restaurant  0.05
3        Burger Joint  0.03
4      Ice Cream Shop  0.03


----Christie----
                venue  freq
0       Grocery Store  0.19
1                Café  0.19
2                Park  0.12
3  Italian Restaurant  0.06
4         Coffee Shop 

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [56]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [61]:
import numpy as np # library to handle data in a vectorized manner

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Café,Cheese Shop,Beer Bar,Steakhouse,Seafood Restaurant,Bakery,Irish Pub
1,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Pizza Place,Recording Studio,Restaurant,Burrito Place,Skate Park,Brewery,Farmers Market,Fast Food Restaurant
2,"Cabbagetown, St. James Town",Coffee Shop,Park,Café,Restaurant,Italian Restaurant,Pub,Bakery,Pizza Place,Sandwich Place,Butcher
3,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Ice Cream Shop,Burger Joint,Sandwich Place,Japanese Restaurant,Gym / Fitness Center,Chinese Restaurant,Bar
4,Christie,Café,Grocery Store,Park,Convenience Store,Coffee Shop,Restaurant,Bank,Italian Restaurant,Diner,Nightclub


## 4. Cluster Neighborhoods

In [63]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


Run *k*-means to cluster the neighborhood into 5 clusters.

In [64]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [69]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Health Food Store,Park,Other Great Outdoors,Neighborhood,Trail,Pub,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant
1,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Bookstore,Brewery,Bubble Tea Shop,Café,Restaurant
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Sandwich Place,Pizza Place,Pet Store,Movie Theater,Brewery,Burger Joint,Burrito Place,Pub,Park,Fast Food Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Yoga Studio,Park,Brewery,Seafood Restaurant,Sandwich Place
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Photography Studio,Park,Bus Line,Swim School,Dog Run,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


Finally, let's visualize the resulting clusters

In [70]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1

In [71]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East Toronto,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Bookstore,Brewery,Bubble Tea Shop,Café,Restaurant
2,East Toronto,0,Sandwich Place,Pizza Place,Pet Store,Movie Theater,Brewery,Burger Joint,Burrito Place,Pub,Park,Fast Food Restaurant
3,East Toronto,0,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Yoga Studio,Park,Brewery,Seafood Restaurant,Sandwich Place
5,Central Toronto,0,Food & Drink Shop,Clothing Store,Breakfast Spot,Park,Gym,Sandwich Place,Hotel,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
6,Central Toronto,0,Coffee Shop,Sporting Goods Shop,Burger Joint,Salon / Barbershop,Café,Restaurant,Rental Car Location,Chinese Restaurant,Clothing Store,Yoga Studio
7,Central Toronto,0,Sandwich Place,Dessert Shop,Café,Italian Restaurant,Gym,Sushi Restaurant,Coffee Shop,Pizza Place,Fried Chicken Joint,Deli / Bodega
9,Central Toronto,0,Pub,Coffee Shop,Bagel Shop,Light Rail Station,Sports Bar,Restaurant,Supermarket,Sushi Restaurant,Fried Chicken Joint,Liquor Store
11,Downtown Toronto,0,Coffee Shop,Park,Café,Restaurant,Italian Restaurant,Pub,Bakery,Pizza Place,Sandwich Place,Butcher
12,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Pizza Place,Bubble Tea Shop,Burger Joint,Hotel,Gym
13,Downtown Toronto,0,Coffee Shop,Park,Bakery,Café,Breakfast Spot,Restaurant,Mexican Restaurant,Pub,Theater,Beer Store


#### Cluster 2

In [73]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,1,Home Service,Garden,Yoga Studio,Doner Restaurant,Filipino Restaurant,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


#### Cluster 3

In [75]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,2,Health Food Store,Park,Other Great Outdoors,Neighborhood,Trail,Pub,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant
23,Central Toronto,2,Trail,Jewelry Store,Park,Sushi Restaurant,Ethiopian Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Yoga Studio


#### Cluster 4

In [76]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,3,Playground,Park,Restaurant,Yoga Studio,Discount Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
10,Downtown Toronto,3,Park,Trail,Building,Playground,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Yoga Studio
