Import the data from the Wikipedia table of postal codes

In [1]:
import pandas as pd
from pandas.io.html import read_html
# Wikipedia article that holds the postal-code table.
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Only tables carrying the "wikitable" CSS class are requested; read_html
# returns one DataFrame per matching table.
wikitable_filter = {'class': 'wikitable'}
extract = read_html(page, attrs=wikitable_filter)

# The first matching table is the one used in the rest of the notebook.
df = extract[0]

The dataframe consists of three columns: `Postal Code`, `Borough`, and `Neighborhood`.

In [2]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Drop all postal codes whose Borough is "Not assigned".

In [3]:
# Keep only the rows whose Borough has been assigned.
# The filter builds a new frame instead of dropping rows in place, so the
# frame df was taken from is left untouched and no throwaway index variable
# is left behind.  .copy() makes the result an independent frame, so later
# in-place edits of df do not operate on a slice of the original.
df = df[df['Borough'] != 'Not assigned'].copy()
df.shape

(103, 3)

The remaining requested cleaning steps do not need to be executed, because the table is already in the format we need.

In [4]:
# Renumber the rows from 0; the old index is discarded (drop=True) rather
# than kept as a column.  The result is rebound to df instead of mutating
# the frame in place.
df = df.reset_index(drop=True)
df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Check whether any Neighborhood value appears more than once in the table, and remove those rows.

In [14]:
# Count how often each Neighborhood value occurs and show the five most frequent.
df['Neighborhood'].value_counts().head()

Downsview                                 4
Don Mills                                 2
Wexford, Maryvale                         1
Parkdale, Roncesvalles                    1
Rouge Hill, Port Union, Highland Creek    1
Name: Neighborhood, dtype: int64

In [17]:
# Remove every row whose Neighborhood name occurs more than once; none of the
# repeated rows is kept.  With the counts shown above this removes the
# 'Downsview' and 'Don Mills' rows, but the names are no longer hard-coded,
# so any other repeated name is handled the same way.
# Rows with a missing Neighborhood are never treated as repeats and are kept.
neighborhood = df['Neighborhood']
repeated = neighborhood.duplicated(keep=False) & neighborhood.notna()
df = df[~repeated]
df.shape

(97, 3)

In [18]:
# Recount the Neighborhood values and show the five most frequent.
df['Neighborhood'].value_counts().head()

Wexford, Maryvale                         1
First Canadian Place, Underground city    1
Davisville                                1
Victoria Village                          1
Rouge Hill, Port Union, Highland Creek    1
Name: Neighborhood, dtype: int64

In [19]:
#find the lat and long for Toronto to be used later as input for the map
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent='toronto_an')
location = geolocator.geocode('Toronto,Ontario')

# geocode() returns None when the service finds no match; stop here with a
# clear message instead of an AttributeError on the attribute accesses below.
if location is None:
    raise RuntimeError("Nominatim returned no result for 'Toronto,Ontario'")

print(location.address)
print((location.latitude, location.longitude))


Toronto, Golden Horseshoe, Ontario, M5H 2N2, Canada
(43.6534817, -79.3839347)


In [20]:
# Load the CSV file with the geo-data (latitude / longitude) for the different areas
geo_url = 'https://cocl.us/Geospatial_data'
df_cord = pd.read_csv(geo_url)
df_cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Merge the two datasets on the shared 'Postal Code' column.
# A left join keeps every row of df; rows without a match in df_cord get
# missing Latitude / Longitude values.
df2 = df.merge(df_cord, on='Postal Code', how='left')
df2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [23]:
#Install folium for creating the map
!conda install -c conda-forge folium=0.5.0 
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                       

In [24]:
# Create the base map of Toronto, centred on the geocoded coordinates.
# The neighbourhood markers are added to it in a later cell.
toronto_center = [location.latitude, location.longitude]
Toronto_map = folium.Map(location=toronto_center, zoom_start=12)

Toronto_map

In [25]:
# Add one circle marker per row of df2, labelled with the neighbourhood name.
# (Previously every popup showed the literal text 'a'.)
for nbhd_name, nbhd_lat, nbhd_lng in zip(df2['Neighborhood'], df2['Latitude'], df2['Longitude']):
    # parse_html=True lets folium escape the label, so names containing
    # apostrophes or other special characters cannot break the rendered map.
    label = folium.Popup(str(nbhd_name), parse_html=True)
    folium.CircleMarker(
        [nbhd_lat, nbhd_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)
Toronto_map

In [104]:

# @hidden_cell

import os
from getpass import getpass

# Credentials are read from environment variables, or prompted for when the
# variable is unset or empty, so that no secret is stored in the notebook.
# NOTE(review): the key pair that used to be hard-coded here has been
# published with the notebook and should be regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20161225' # Foursquare API version


In [28]:
# define limit = 5 (limit to 5 venues only) & radius = 500 (meters)
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe (not used in this cell)
radius = 500
LIMIT = 5

# Endpoint of the Foursquare "explore" call; the query string is built by requests.
url = 'https://api.foursquare.com/v2/venues/explore'

location_list = [] # initiate a list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(df2['Neighborhood'], df2['Latitude'], df2['Longitude']):

    # Passing the parameters as a dict lets requests URL-encode them.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(latitude, longitude),
        'radius': radius,
        'limit': LIMIT,
    }

    # The timeout keeps one stalled request from hanging the whole loop.
    response = requests.get(url, params=params, timeout=30)
    # Fail right away on an HTTP error status instead of hitting a KeyError
    # further down when the payload has no venue data.
    response.raise_for_status()
    data = response.json()

    # A payload without 'groups' or 'items' is treated like an empty result.
    groups = data.get('response', {}).get('groups') or []
    items = groups[0].get('items', []) if groups else []
    if not items:
        continue # skip the row if nothing is found

    # Only the first returned venue is kept for each neighbourhood.
    venue = items[0]['venue']

    # extract info within 'venue'
    name = venue['name']
    lat = venue['location']['lat']
    lon = venue['location']['lng']
    cat = venue['categories'][0]['name']

    # Each record is wrapped in a one-element list because the cell that
    # builds the DataFrame flattens location_list by one level.
    location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [30]:
# Inspect the first two collected records.
location_list[0:2]

[[('Parkwoods',
   43.7532586,
   -79.3296565,
   'Brookbanks Park',
   43.751976046055574,
   -79.33214044722958,
   'Park')],
 [('Victoria Village',
   43.725882299999995,
   -79.31557159999998,
   'Victoria Village Arena',
   43.72348055545508,
   -79.31563520925143,
   'Hockey Arena')]]

In [80]:
# Build a data frame from 'location_list'.
# Every entry of location_list is a one-element list holding a record tuple,
# so the entries are flattened by one level first.
venue_records = [record for wrapped in location_list for record in wrapped]

tb = pd.DataFrame(venue_records)
tb.columns = [
    'Neighbourhood',
    'N_Latitude',
    'N_Longitude',
    'Venue',
    'V_Latitude',
    'V_Longitude',
    'category',
]
tb.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
2,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Roots,43.718214,-79.463893,Boutique
4,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Queen's Park,43.663946,-79.39218,Park


In [88]:
# One hot encoding: get_dummies() creates one indicator column per venue category
categories = pd.get_dummies(tb['category'])

# Place the neighbourhood name and coordinates in front of the category columns
neighbourhood_cols = tb[['Neighbourhood', 'N_Latitude', 'N_Longitude']]
tb_01 = pd.concat([neighbourhood_cols, categories], axis=1)
tb_01.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,...,Skating Rink,Sports Bar,Summer Camp,Supermarket,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Warehouse Store,Yoga Studio
0,Parkwoods,43.753259,-79.329656,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,43.725882,-79.315572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",43.65426,-79.360636,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Clustering neighborhoods

In [89]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Cluster on the category columns only: the neighbourhood name and its
# coordinates are removed from the feature table.
tb_02 = tb_01.drop(columns=['Neighbourhood', 'N_Latitude', 'N_Longitude'])
n_group = 6 # we will group neighbourhoods into 6 clusters

# run k-means clustering (random_state is fixed so repeated runs give the same labels)
kmeans = KMeans(n_clusters=n_group, random_state=0)
kmeans.fit(tb_02)

# check the cluster labels generated for the first ten rows
kmeans.labels_[:10]

array([5, 0, 4, 0, 5, 0, 0, 0, 4, 0], dtype=int32)

In [90]:
#Insert the clusters in the original dataframe
# DataFrame.insert raises a ValueError when the column already exists, so on
# a re-run the labels are overwritten and the column is only inserted once.
if 'clusters' in tb_01.columns:
    tb_01['clusters'] = kmeans.labels_
else:
    tb_01.insert(1, 'clusters', kmeans.labels_)


In [91]:
# Preview tb_01 with the 'clusters' column in place.
tb_01.head()

Unnamed: 0,Neighbourhood,clusters,N_Latitude,N_Longitude,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,...,Skating Rink,Sports Bar,Summer Camp,Supermarket,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Warehouse Store,Yoga Studio
0,Parkwoods,5,43.753259,-79.329656,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,0,43.725882,-79.315572,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",4,43.65426,-79.360636,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lawrence Manor, Lawrence Heights",0,43.718518,-79.464763,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Queen's Park, Ontario Provincial Government",5,43.662301,-79.389494,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# Fresh map centred on the geocoded Toronto coordinates.
cluster_map = folium.Map(location = [location.latitude, location.longitude],
                                        zoom_start = 12 )
# set color scheme for the clusters: one entry per cluster label, indexed by the label
rainbow=['red','blue','yellow','orange','green','white']

# One marker per neighbourhood, coloured by its cluster and labelled with the
# neighbourhood name and cluster number.  (Previously every popup showed the
# literal text 'a'.)
for nbhd_name, cluster, nbhd_lat, nbhd_lng in zip(tb_01['Neighbourhood'], tb_01['clusters'], tb_01['N_Latitude'], tb_01['N_Longitude']):
    # parse_html=True lets folium escape the label, so names containing
    # apostrophes or other special characters cannot break the rendered map.
    label = folium.Popup('{} (cluster {})'.format(nbhd_name, cluster), parse_html=True)
    folium.CircleMarker(
        [nbhd_lat, nbhd_lng],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
        parse_html=False).add_to(cluster_map)
cluster_map

In [99]:
# Show the current colour list.
# NOTE(review): the recorded output below lists hex colours, not the named
# colours assigned in the cluster-map cell; it appears to come from an
# earlier run, so re-run this cell to refresh it.
rainbow

['#8000ff', '#1996f3', '#4df3ce', '#b2f396', '#ff964f', '#ff0000']