This Notebook is a project where we will be examining different neighborhoods of Toronto by web scraping data, creating relational dataframes, and plotting information about each neighborhood on a zoomable map of the greater Toronto area.

In [5]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Scrape the wiki page containing postal codes of Canada pertaining to Toronto using BeautifulSoup

In [6]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
Canada_df = BeautifulSoup(source, 'lxml')
table = Canada_df.find('table', {'class': 'wikitable sortable'})
#Converts the table from the Wiki page into a usable dataframe

In [7]:
#Start building the dataframe by building a for loop to populate
table_rows = table.find_all('tr')
data = []
for row in table_rows:
    td=[]
    for t in row.find_all('td'):
        td.append(t.text.strip())
    data.append(td)
df = pd.DataFrame(data, columns=['Postalcode', 'Borough', 'Neighborhood'])
df.head(10)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M8A,Not assigned,
9,M9A,Etobicoke,Islington Avenue


As we can see, there are some not assigned, null values, and plenty of duplicates that we need to clean up

In [8]:
#cleaning the data
df = df[~df['Borough'].isnull()]
df.drop(df[df.Borough == 'Not assigned'].index, inplace=True)
df.reset_index(drop=True, inplace=True)
df = df.groupby(['Postalcode', 'Borough'])['Neighborhood'].apply(lambda x: ','.join(x)).reset_index()
df['Neighborhood'].replace('Not assigned', df['Borough'], inplace=True)
df.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [9]:
df.shape

(103, 3)

We now want to pull the data from the geospatial coordinates CSV which will give us the latitude and longitude of each neighborhood in Toronto

In [11]:
path = 'https://cocl.us/Geospatial_data'
df_geo = pd.read_csv(path)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
df_geo.shape

(103, 3)

In [13]:
df_geo.rename(columns={'Postal Code': 'Postalcode'}, inplace=True)
df_geo.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now we want to merge our two dataframes

In [14]:
df_merged = df.join(df_geo.set_index('Postalcode'), on='Postalcode')
df_merged.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


In [16]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means from clustering stage to visualize neighborhood locations
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [17]:
from geopy.geocoders import Nominatim
address = 'Toronto, ON, Canada'

#We need to get initialize an instance of the geo-locator, so we will set it to tor_explorer
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ON, Canada are 43.6534817, -79.3839347.


With these coordinates, we can start building the map of our neighborhoods.

In [42]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Now we would like to utilize foursquare to explore some different neighborhoods of Toronto

In [18]:
CLIENT_ID = 'GME5MSLHY0MRGQ42F1B2330DBM2CY4B2FTCY12OBVCYERREY' #Foursquare ID
CLIENT_SECRET = 'FLADA2ESTV0UC5UIUQXOCASDUOLLLY2TDAHEFL1DBI1UCJU5' #Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GME5MSLHY0MRGQ42F1B2330DBM2CY4B2FTCY12OBVCYERREY
CLIENT_SECRET:FLADA2ESTV0UC5UIUQXOCASDUOLLLY2TDAHEFL1DBI1UCJU5


Let's choose something on the east side of Toronto

In [33]:
df_merged.loc[40, 'Neighborhood']

'East Toronto'

In [34]:
neighborhood_latitude = df_merged.loc[40, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_merged.loc[40, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_merged.loc[40, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of East Toronto are 43.685347, -79.3381065.


In [37]:
LIMIT = 20 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius from the point of interest (the neighborhood we selected)
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=GME5MSLHY0MRGQ42F1B2330DBM2CY4B2FTCY12OBVCYERREY&client_secret=FLADA2ESTV0UC5UIUQXOCASDUOLLLY2TDAHEFL1DBI1UCJU5&v=20180605&ll=43.685347,-79.3381065&radius=1000&limit=20'

In [38]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ea4811fe826ac2d0dbed5cc'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Greektown',
  'headerFullLocation': 'Greektown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 98,
  'suggestedBounds': {'ne': {'lat': 43.69434700900001,
    'lng': -79.32568406780766},
   'sw': {'lat': 43.67634699099999, 'lng': -79.35052893219233}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4aeb26cff964a52009bf21e3',
       'name': 'The Only Cafe',
       'location': {'address': '972 Danforth Ave',
        'crossStreet': 'at Donlands Ave',
        'lat': 43.680408988135255,
        'lng': -79.33789819168385,
        'labeledLatLngs': [{

Lets restructure the .json file shown above into an easier to read dataframe

In [39]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#Now we clean the new data
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # normalize/flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,The Only Cafe,Beer Bar,43.680409,-79.337898
1,Serano Bakery,Bakery,43.683139,-79.346531
2,efes bar&grill,Gastropub,43.680242,-79.338346
3,Mr. Pide,Turkish Restaurant,43.679635,-79.34153
4,Motorama Restaurant,Diner,43.679849,-79.340101
5,Sakawa Coffee,Café,43.679906,-79.339807
6,Danforth Dragon Restaurant,Asian Restaurant,43.679811,-79.339838
7,The Wren,American Restaurant,43.682467,-79.328079
8,Red Rocket Coffee,Café,43.68234,-79.32853
9,Morgan's on the Danforth,Gastropub,43.682044,-79.330255


We found a beer bar! 

This is the end of Part 1 of the notebook!