In [1]:
# import all libraries
import requests
import warnings
warnings.filterwarnings("ignore")
import os
import codecs
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

### Part 2: Working with Geographical data 

This is part 2 of the assessment we are still using our Web scraping pipeline,so we add all the web scraping code to a script which we import.

In [2]:
import webscraping as ws

In [3]:
wikipage = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#### Call the function, get_wikipedia and convert_table from the pipeline code and pass through wikipedia page once again

In [4]:
load_html_page = ws.get_wikipedia(wikipage)
postal_codes_df = ws.convert_table(load_html_page, return_df=True).replace(to_replace='Not assigned', value=np.nan)
postal_codes_df['Neighbourhood'] = postal_codes_df['Neighbourhood'].fillna(postal_codes_df["Borough"])
postal_codes_df = postal_codes_df.dropna(axis=0)

#### Working with only boroughs data that contains the word "Toronto" 

In [5]:
postcodes_df = postal_codes_df[postal_codes_df.Borough.str.contains('Toronto')].copy().reset_index(drop=True)

#### Load the geographical coordinates data and link them with the right Postcode

In [6]:
geographical_data = (pd.read_csv('Geospatial_Coordinates.csv', index_col='Postal Code')
              .pipe(postcodes_df.merge, right_index=True, left_on='Postcode'))

In [7]:
geographical_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M4E,East Toronto,The Beaches,43.676357,-79.293031
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


### Lets start working with geo data

In [8]:
%matplotlib inline
import json # library to handle JSON files

from geopy.geocoders import Nominatim 

import requests 
from pandas.io.json import json_normalize 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium 

In [9]:
addr = 'Toronto, ON'

geolocator_obj = Nominatim()
location = geolocator_obj.geocode(addr)
lat = location.latitude
lng = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, lng))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [10]:
# Create map of Toronto
map_toronto = folium.Map(location=[lat, lng], zoom_start=11)

# add markers to map
for lat, lng, label in zip(geographical_data['Latitude'], geographical_data['Longitude'], geographical_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
# View map    
map_toronto

In [11]:
CLIENT_ID = 'K2KHBXRAX3YPPPOCELWV5JWPFGF4IAV1JAFJYYJNVQJ1WONH' # your Foursquare ID
CLIENT_SECRET = 'BGOM4RAMKJ5Q0KQ1AQ1EKOUHYDR5QWBO1ETDTDKPQVUNBBK0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: K2KHBXRAX3YPPPOCELWV5JWPFGF4IAV1JAFJYYJNVQJ1WONH
CLIENT_SECRET:BGOM4RAMKJ5Q0KQ1AQ1EKOUHYDR5QWBO1ETDTDKPQVUNBBK0


In [12]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT = 50
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
toronto_venues = getNearbyVenues(names=geographical_data['Neighbourhood'],
                                   latitudes=geographical_data['Latitude'],
                                   longitudes=geographical_data['Longitude']
                                  )

Harbourfront
Regent Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city


#### Sanity check? View the first 5 rows and the dimension of the new dataframe.

In [15]:
toronto_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [16]:
print(toronto_venues.shape)

(2255, 7)


In [17]:
print('Number of unique categories: {}.'.format(len(toronto_venues['Venue Category'].unique())))

Number of unique categories: 215.


#### One hot encoding of the Venue Category

In [18]:
toronto_onehot_vector = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot_vector['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# swap neighbourhood column with the first column
fixed_columns = [toronto_onehot_vector.columns[-1]] + list(toronto_onehot_vector.columns[:-1])
toronto_onehot_vector = toronto_onehot_vector[fixed_columns]

toronto_onehot_vector.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Print shape of the one hot vector

In [19]:
print(toronto_onehot_vector.shape)

(2255, 216)


#### Group rows by neighbourhood and by taking the average of the frequency of occurrence for each category

In [20]:
toronto_grouped = toronto_onehot_vector.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
1,Bathurst Quay,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824


In [21]:
print(toronto_grouped.shape)

(73, 216)


#### Let's print each neighbourhood along with the top 5 most common venues

In [22]:
num_top_venues = 5
for hood in toronto_grouped['Neighbourhood']:
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','frequency']
    temp = temp.iloc[1:]
    temp['frequency'] = temp['frequency'].astype(float)
    temp = temp.round({'frequency': 2})

#### Throw the results in a dataframe, from highest to lowest venue.

In [23]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


#### Display the top 10 for each neighbourhood

In [24]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
# View the top 5 rows
neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Steakhouse,Café,Hotel,Gastropub,American Restaurant,Pizza Place,Breakfast Spot,Restaurant,Asian Restaurant
1,Bathurst Quay,Airport Terminal,Airport Lounge,Airport Service,Harbor / Marina,Boat or Ferry,Sculpture Garden,Plane,Boutique,Airport Gate,Airport
2,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Cheese Shop,Café,Farmers Market,Seafood Restaurant,Bakery,Steakhouse,Beach
3,Brockton,Coffee Shop,Breakfast Spot,Café,Nightclub,Restaurant,Burrito Place,Climbing Gym,Furniture / Home Store,Stadium,Falafel Restaurant
4,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Smoke Shop,Park,Spa,Farmers Market,Fast Food Restaurant,Brewery,Burrito Place
