# This notebook creates a dataframe for neighborhoods in Toronto

In [262]:
import pandas as pd
import numpy as np

### Read html to scrape data from wikipedia page

In [263]:
df=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

In [264]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Removing not assigned boroughs

In [265]:
df=df[df.Borough !='Not assigned']

In [266]:
df.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### Change not assigned neighbourhoods to borough

In [268]:

df.Neighbourhood = np.where(df.Neighbourhood == 'Not assigned',df.Borough, df.Neighbourhood)

In [269]:
df.shape

(103, 3)

### Dataframe from geospatial data

In [270]:
latLon=pd.read_csv('http://cocl.us/Geospatial_data')

In [271]:
latLon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [272]:
frames=[df,latLon]

### Concatenate geospatial data and neighbourhood data

In [273]:
df2=pd.concat(frames,join='inner', axis=1)

In [274]:
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Postal Code.1,Latitude,Longitude
2,M3A,North York,Parkwoods,M1E,43.763573,-79.188711
3,M4A,North York,Victoria Village,M1G,43.770992,-79.216917
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1H,43.773136,-79.239476
5,M6A,North York,"Lawrence Manor, Lawrence Heights",M1J,43.744734,-79.239476
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1K,43.727929,-79.262029


# Analysis of neighbourhoods

### Install packages

In [275]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation


!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize


! pip install folium==0.5.0
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Folium installed
Libraries imported.


In [277]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


### Define foursquare credentials

In [278]:
CLIENT_ID = 'TQUVPGPQRKY1VNDSPDXBH3KMYIEPOV02SJBPWHLPII0TKVKY' # your Foursquare ID
CLIENT_SECRET = 'F1KVZ2PAKTDFOMHUGF0F2V0L2JRZ03GT1BVRT0D4X3F4IUWZ' # your Foursquare Secret
ACCESS_TOKEN = 'PB4QI4H1PGYO5A20Q5EBG13Y4YF5CWQ3HHUFRPDZ42KBH1AP' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TQUVPGPQRKY1VNDSPDXBH3KMYIEPOV02SJBPWHLPII0TKVKY
CLIENT_SECRET:F1KVZ2PAKTDFOMHUGF0F2V0L2JRZ03GT1BVRT0D4X3F4IUWZ


### Get the geographical coordinates of Downtown Toronto

In [279]:
#Restrict dataframe to the Downtown Toronto Borough
dft = df2[df2.Borough == 'Downtown Toronto']


In [280]:
#Get the geographical coordinates
address = 'Downtown Toronto, ON, Canada'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.6563221 -79.3809161


### Create folium map of neighbourhoods

In [282]:
# create map of Downtown Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighbourhood in zip(dft['Latitude'], dft['Longitude'], dft['Borough'], dft['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Get top 100 venues within 500 meters radius

In [283]:
limit=100
radius=500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=TQUVPGPQRKY1VNDSPDXBH3KMYIEPOV02SJBPWHLPII0TKVKY&client_secret=F1KVZ2PAKTDFOMHUGF0F2V0L2JRZ03GT1BVRT0D4X3F4IUWZ&v=20180604&ll=43.6563221,-79.3809161&radius=500&limit=30'

In [185]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '6013855b4375590137e01edf'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 93,
  'suggestedBounds': {'ne': {'lat': 43.6608221045, 'lng': -79.37470788695488},
   'sw': {'lat': 43.651822095499995, 'lng': -79.3871243130451}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '57eda381498ebe0e6ef40972',
       'name': 'UNIQLO ユニクロ',
       'location': {'address': '220 Yonge St',
        'crossStreet': 'at Dundas St W',
        'lat': 43.65591027779457,
        'lng': -79.38064099181345,
        'labeledLatLngs'

In [284]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [285]:
#Run above function on each neighbourhood and make a new dataframe
toronto_venues = getNearbyVenues(names=dft['Neighbourhood'],
                                   latitudes=dft['Latitude'],
                                   longitudes=dft['Longitude']
                                  )

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel


In [286]:
print(toronto_venues.shape)
toronto_venues.head()

(184, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.773136,-79.239476,Federick Restaurant,43.774697,-79.241142,Hakka Restaurant
1,"Regent Park, Harbourfront",43.773136,-79.239476,Drupati's Roti & Doubles,43.775222,-79.241678,Caribbean Restaurant
2,"Regent Park, Harbourfront",43.773136,-79.239476,Thai One On,43.774468,-79.241268,Thai Restaurant
3,"Regent Park, Harbourfront",43.773136,-79.239476,Centennial Recreation Centre,43.774593,-79.2365,Athletics & Sports
4,"Regent Park, Harbourfront",43.773136,-79.239476,TD Canada Trust,43.77483,-79.241251,Bank


In [287]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,5,5,5,5,5,5
Central Bay Street,3,3,3,3,3,3
Christie,30,30,30,30,30,30
"Commerce Court, Victoria Hotel",17,17,17,17,17,17
"Garden District, Ryerson",12,12,12,12,12,12
"Harbourfront East, Union Station, Toronto Islands",30,30,30,30,30,30
"Queen's Park, Ontario Provincial Government",5,5,5,5,5,5
"Regent Park, Harbourfront",8,8,8,8,8,8
"Richmond, Adelaide, King",14,14,14,14,14,14
St. James Town,30,30,30,30,30,30


In [288]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 90 uniques categories.


### Analyse the neighbourhoods

In [289]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,Bar,Beer Bar,Belgian Restaurant,Bookstore,Brewery,Bubble Tea Shop,Bus Station,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Cocktail Bar,Coffee Shop,Colombian Restaurant,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop,Department Store,Dessert Shop,Discount Store,Electronics Store,Farmers Market,Fast Food Restaurant,Fish Market,Food Court,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Gas Station,Gastropub,Gourmet Shop,Greek Restaurant,Grocery Store,Gym / Fitness Center,Hakka Restaurant,Hotel,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Light Rail Station,Liquor Store,Lounge,Mexican Restaurant,Middle Eastern Restaurant,Movie Theater,Music Venue,Neighborhood,Noodle House,Opera House,Organic Grocery,Park,Pet Store,Pharmacy,Pizza Place,Plaza,Pool,Pub,Ramen Restaurant,Record Shop,Restaurant,Sandwich Place,Seafood Restaurant,Shopping Mall,Smoke Shop,Spa,Speakeasy,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Tibetan Restaurant,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Group by neighbourhood

In [290]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [291]:
# Let's put that into a pandas dataframe
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [292]:


# Create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted



Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Grocery Store,Park,Shopping Mall,Bank,Concert Hall,Convenience Store,Cosmetics Shop,Department Store,Dessert Shop,Discount Store
1,Central Bay Street,Park,Convenience Store,Intersection,Wine Shop,Fast Food Restaurant,Cosmetics Shop,Department Store,Dessert Shop,Discount Store,Electronics Store
2,Christie,Greek Restaurant,Ice Cream Shop,Italian Restaurant,Yoga Studio,Restaurant,Fruit & Vegetable Store,Grocery Store,Indian Restaurant,Juice Bar,Dessert Shop
3,"Commerce Court, Victoria Hotel",Bakery,Pharmacy,Wine Shop,Brewery,Grocery Store,Liquor Store,Middle Eastern Restaurant,Music Venue,Park,Pool
4,"Garden District, Ryerson",Pizza Place,Fast Food Restaurant,Shopping Mall,Noodle House,Gas Station,Thai Restaurant,Chinese Restaurant,Fried Chicken Joint,Bank,Italian Restaurant
5,"Harbourfront East, Union Station, Toronto Islands",Café,Coffee Shop,Pizza Place,Hotel,American Restaurant,Seafood Restaurant,Colombian Restaurant,Concert Hall,Jazz Club,Opera House
6,"Queen's Park, Ontario Provincial Government",Coffee Shop,Convenience Store,Department Store,Discount Store,Bus Station,Wine Shop,Fast Food Restaurant,Cosmetics Shop,Dessert Shop,Electronics Store
7,"Regent Park, Harbourfront",Hakka Restaurant,Athletics & Sports,Thai Restaurant,Gas Station,Bakery,Bank,Caribbean Restaurant,Fried Chicken Joint,Fast Food Restaurant,Cosmetics Shop
8,"Richmond, Adelaide, King",Coffee Shop,Bagel Shop,Liquor Store,Vietnamese Restaurant,American Restaurant,Restaurant,Light Rail Station,Sushi Restaurant,Supermarket,Pub
9,St. James Town,Ramen Restaurant,Café,Sandwich Place,Coffee Shop,Arts & Crafts Store,Restaurant,Pet Store,Bubble Tea Shop,Pizza Place,Plaza


### Use K-means clustering to cluster neighbourhoods

In [293]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 3, 0, 3, 3, 4, 0, 3, 3])

In [296]:


# Create a new dataframe that includes the cluster
toronto_merged = dft

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

toronto_merged.head() # check the last columns!



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toronto_merged['Cluster Labels'] = kmeans.labels_


Unnamed: 0,Postal Code,Borough,Neighbourhood,Postal Code.1,Latitude,Longitude,Cluster Labels
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1H,43.773136,-79.239476,2
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1K,43.727929,-79.262029,1
13,M5B,Downtown Toronto,"Garden District, Ryerson",M1T,43.781638,-79.304302,3
22,M5C,Downtown Toronto,St. James Town,M2N,43.77012,-79.408493,0
31,M5E,Downtown Toronto,Berczy Park,M3L,43.739015,-79.506944,3


### Display clusters on folium map

In [297]:


# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

