In [1]:
import pandas as pd
import numpy as np
import geocoder

## Part 1: Web Parsing Addresses from Wikipedia
#### Source: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [4]:
# Read the first table on the Wikipedia page and normalise the column spelling
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
toronto_data = (
    pd.read_html(wiki_url, header=0)[0]
    .rename(columns={"Neighborhood": "Neighbourhood"})
)
toronto_data.head() # Display first 5 rows

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# (rows, columns) of the table as loaded, before any filtering
toronto_data.shape

(287, 3)

#### We ignore entries with an unassigned borough, and replace any unassigned neighbourhood values with their respective borough

In [6]:
# Keep only postal codes that have a borough. `.copy()` gives an independent
# frame so later writes do not target a slice of the original table.
toronto_data = toronto_data[toronto_data['Borough'] != 'Not assigned'].copy()

def fill_unassigned_neighbourhoods(row):
    """Return `row` with an unassigned neighbourhood replaced by its borough.

    Parameters
    ----------
    row : pandas.Series
        One row of `toronto_data`; must contain 'Neighbourhood' and 'Borough'.

    Returns
    -------
    pandas.Series
        A copy of the row; 'Neighbourhood' equals 'Borough' when it was
        'Not assigned', otherwise the row is unchanged.
    """
    row = row.copy()  # do not mutate the object handed in by DataFrame.apply
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
    return row

# DataFrame.apply returns a new frame; it must be assigned back, otherwise the
# filled-in values are thrown away.
toronto_data = toronto_data.apply(fill_unassigned_neighbourhoods, axis=1)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### Since more than one neighbourhood can exist in one postal code area, any of these relevant rows will be combined into one row with the neighbourhoods comma-separated as shown in the following table

In [7]:
# One row per (Postcode, Borough); neighbourhood names are joined with ", "
toronto_data = (
    toronto_data
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# (rows, columns) after combining neighbourhoods per postal code
toronto_data.shape

(103, 3)

#### Then we add latitude and longitude coordinates

In [9]:
# Upper bound on geocoding attempts per postal code, so a code that never
# resolves cannot hang the notebook.
MAX_GEOCODE_ATTEMPTS = 10

lats = []
lons = []

for postal_code in toronto_data['Postcode']:
    lat_lng_coords = None
    # retry until we get the coordinates, but give up after a fixed number of tries
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    else:
        # the for-loop finished without `break`: every attempt returned None
        raise RuntimeError(
            'Could not geocode postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    lats.append(latitude)
    lons.append(longitude)

#### Now that we have the collection of latitudes and longitudes in `lats` and `lons`, we can add them as columns to the `toronto_data` dataframe

In [10]:
# Attach the geocoded coordinates, in the same order as the rows they were fetched for
toronto_data = toronto_data.assign(Latitude=lats, Longitude=lons)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


#### We will treat Lawrence Park as the approximate center of Toronto, so we grab its coordinates

In [12]:
# Geocode the reference point used as the map centre
g = geocoder.arcgis('Lawrence Park, Toronto, Ontario')
lat_lng_coords = g.latlng
if lat_lng_coords is None:
    # `latlng` is None when the lookup fails; fail with a clear message
    # instead of a TypeError from indexing None.
    raise RuntimeError("Could not geocode 'Lawrence Park, Toronto, Ontario'")
center = [lat_lng_coords[0], lat_lng_coords[1]]  # [latitude, longitude]

#### Let's visualize the data we have so far, using Folium

In [11]:
import folium

In [15]:
map_toronto = folium.Map(location=center, zoom_start=11) # Initialize the map

# Add one marker per postal-code row, labelled with its neighbourhood name(s)
marker_fields = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Neighbourhood'],
)
for marker_lat, marker_lng, marker_label in marker_fields:
    folium.Marker([marker_lat, marker_lng], popup=marker_label).add_to(map_toronto)

map_toronto # Display the map. Note that this will not show on Github. Use a Notebook viewer online

#### Defining Foursquare API Credentials and Version

In [16]:
import os
import warnings

# Credentials are read from the environment so that they are never stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Any key that was previously committed in this notebook
# should be considered leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
VERSION = '20200114' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        'FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET is not set; '
        'the Foursquare requests below need both.'
    )

#### We get the list of breakfast restaurants that are in each neighbourhood within a radius of 2 kilometers. A radius of this choice is so that the larger neighbourhoods are better covered. There will be duplicates among the smaller neighbourhoods but this will be taken care of

In [30]:
import requests
# Search settings for the venue lookup
radius = 2000  # metres (the 2 km radius described above)
LIMIT = 50  # value sent as the `limit` parameter of each request
query = "breakfast"  # value sent as the `query` parameter of each request

In [31]:
# Function to accomplish this
def getNearbyBreakfastRestaurants(names, latitudes, longitudes, radius=2000):
    """Collect Foursquare "explore" results around each neighbourhood.

    One request is made per (name, latitude, longitude) triple. The
    notebook-level `CLIENT_ID`, `CLIENT_SECRET`, `VERSION`, `LIMIT` and
    `query` are sent with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood labels and their coordinates; iterated in parallel
        with `zip`, so extra items in a longer iterable are ignored.
    radius : int, default 2000
        Search radius sent to the API (metres).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue Name',
        'Venue ID', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame is empty, but still has these columns, when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue Name',
               'Venue ID',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and URL-encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
                'query': query,
            },
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        # surface API errors here instead of as a KeyError on the payload
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['id'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories gets None instead of raising IndexError
                categories[0]['name'] if categories else None,
            ))

    # passing `columns=` keeps the schema even when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

In [51]:
# Fetch venues around every postal-code area (default search radius)
toronto_restaurants = getNearbyBreakfastRestaurants(
    names=toronto_data['Neighbourhood'],
    latitudes=toronto_data['Latitude'],
    longitudes=toronto_data['Longitude'],
)
toronto_restaurants.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue ID,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.811525,-79.195517,BeaverTails,4c546bef1b46c9b689e291ce,43.823376,-79.184616,Dessert Shop
1,"Rouge, Malvern",43.811525,-79.195517,Pizza Pizza,4ba6f126f964a520ee7839e3,43.806613,-79.178445,Pizza Place
2,"Rouge, Malvern",43.811525,-79.195517,Tim Hortons,4b16e23bf964a520edbe23e3,43.802,-79.198169,Coffee Shop
3,"Rouge, Malvern",43.811525,-79.195517,Tim Hortons / Esso,4e0b137722713e13018e7117,43.801863,-79.199296,Coffee Shop
4,"Rouge, Malvern",43.811525,-79.195517,Peacock Café (Toronto Zoo),4c82860fd8086dcb735c7752,43.820012,-79.181563,Café


In [52]:
# Save the unfiltered API results (the DataFrame index is written as the first column)
toronto_restaurants.to_csv('toronto_restaurants_raw.csv') 

In [53]:
# (rows, columns) of the unfiltered venue table
toronto_restaurants.shape

(3995, 8)

#### It is now time to filter out restaurants that aren't exactly breakfast restaurants, as well as handle any duplicates due to the 2 km radius overlap. We will remove fast food restaurants such as Tim Hortons, Subway, and Starbucks

In [54]:
# How many rows each venue name accounts for, most frequent first
toronto_restaurants['Venue Name'].value_counts().to_frame('Total Venues')

Unnamed: 0,Total Venues
Tim Hortons,345
Starbucks,233
Subway,211
Second Cup,56
No Frills,49
A&W,42
Aroma Espresso Bar,39
Loblaws,30
Pizza Pizza,28
Country Style,27


#### To remove the large fast food chain restaurants quickly, we can remove the venue names that appear more than 40 times in the results

In [55]:
# Venue names that appear more than 40 times are treated as large chains
venue_name_counts = toronto_restaurants['Venue Name'].value_counts()
blacklist = venue_name_counts[venue_name_counts > 40].index.tolist()
is_chain = toronto_restaurants['Venue Name'].isin(blacklist)
toronto_restaurants = toronto_restaurants[~is_chain]

#### Then we remove restaurants that have words like "Pizza", "Burger", "Loblaws", "KFC", "Chicken", "Popeyes" in the name

In [56]:
# Drop venues whose name contains any of these words (case-sensitive regex alternation)
blacklist = ["Pizza", "Burger", "Loblaws", "KFC", "Chicken", "Popeyes"]
name_pattern = '|'.join(blacklist)
has_blacklisted_word = toronto_restaurants['Venue Name'].str.contains(name_pattern)
toronto_restaurants = toronto_restaurants[~has_blacklisted_word]
toronto_restaurants['Venue Name'].value_counts().to_frame('Total Venues')

Unnamed: 0,Total Venues
Aroma Espresso Bar,39
Country Style,27
Pilot Coffee Roasters,27
Sunset Grill,25
DAVIDsTEA,24
Booster Juice,24
Jimmy's Coffee,23
Balzac's Coffee,19
Dark Horse Espresso Bar,19
Eggsmart,18


In [57]:
# (rows, columns) after both blacklist filters
toronto_restaurants.shape

(2867, 8)

#### It looks like we have a decent collection of breakfast restaurants! Let us check and handle duplicate restaurant locations across the neighbourhoods now

In [58]:
# Occurrences of each venue ID beyond the first, i.e. how often it is repeated
venue_id_counts = toronto_restaurants['Venue ID'].value_counts()
(venue_id_counts - 1).to_frame('Total Duplicates')

Unnamed: 0,Total Duplicates
5aff06ca6e4650002cc6286b,13
4fff1f96e4b042ae8acddca5,13
50322b6ae4b09116a296568c,13
53c524bd498efaeebf73b291,11
537773d1498e74a75bb75c1e,11
514627d1e4b0dba1b85e9ba8,11
5b6c842bc36588002c80a934,11
59cd51c71b0ea516e9e7b3aa,11
54132b3b498ee9ca9332e189,10
5894c1f9266c1121f0a757d1,10


#### The highest number of duplicates for a restaurant is 13. This number is not very large. If we remove duplicates then we risk bias towards venues that are at the "top of the list" in the restaurants dataframe. I think it is best to not remove duplicates for now

#### Time to export the cleaned dataframe

In [59]:
# Save the filtered venue table (duplicates across neighbourhoods are kept)
toronto_restaurants.to_csv('toronto_restaurants_processed.csv') 

#### What is the number of breakfast restaurants per neighbourhood?

In [67]:
# Venue rows per neighbourhood, largest first, as a two-column table
neighbourhood_restaurants = (
    toronto_restaurants['Neighbourhood']
    .value_counts()
    .rename_axis('Neighbourhood')
    .reset_index(name='Total Breakfast Restaurants')
)
neighbourhood_restaurants

Unnamed: 0,Neighbourhood,Total Breakfast Restaurants
0,Queen's Park,61
1,"Little Portugal, Trinity",50
2,Christie,49
3,"Cabbagetown, St. James Town",49
4,"The Annex, North Midtown, Yorkville",48
5,"Brockton, Exhibition Place, Parkdale Village",48
6,"Parkdale, Roncesvalles",48
7,"Chinatown, Grange Park, Kensington Market",48
8,Rosedale,48
9,"First Canadian Place, Underground city",47


#### It should be no surprise that Queen's Park has the largest number of breakfast restaurants (61) within a 2 km radius. It is an urban park in the middle of Downtown Toronto, and it is bordered by the University of Toronto

#### Let's map out the distribution of these restaurants using a heatmap. We will draw 3 circles, indicating a 1, 2, and 3 km radius from the center of Toronto

In [77]:
from folium import plugins
from folium.plugins import HeatMap

In [97]:
# [latitude, longitude] pairs of every remaining venue
breakfast_latlons = toronto_restaurants[['Venue Latitude', 'Venue Longitude']].values

map_toronto = folium.Map(location=center, zoom_start=11)
folium.TileLayer('cartodbpositron').add_to(map_toronto)
HeatMap(breakfast_latlons).add_to(map_toronto)
folium.Marker(center).add_to(map_toronto)

# Reference rings at 1, 2 and 3 km (radius in metres) around the centre
for ring_radius in (1000, 2000, 3000):
    folium.Circle(center, radius=ring_radius, fill=False, color='white').add_to(map_toronto)

map_toronto

#### We can see that there is a higher density of breakfast restaurants south of the center of Toronto. There is a lower density to the west and north however, indicating that there might be an unmet breakfast demand in these areas

#### Restaurants seem to be located in Downtown Toronto and this is likely because of the amount of traffic in this location
### Let us now look at traffic data

![Traffic](Traffic.png)

In [106]:
# Load the first sheet of the traffic-signal volume workbook (read from the working directory)
traffic_data = pd.read_excel('traffic-signal-vehicle-and-pedestrian-volumes-data.xlsx', sheet_name=0)
# printed explicitly because only the last expression of a cell is displayed
print(traffic_data.shape)
traffic_data.head()

(2280, 11)


Unnamed: 0,TCS #,Main,Midblock Route,Side 1 Route,Side 2 Route,Activation Date,Latitude,Longitude,Count Date,8 Peak Hr Vehicle Volume,8 Peak Hr Pedestrian Volume
0,2,JARVIS ST,,FRONT ST E,,11/15/1948,43.649418,-79.371446,2017-06-21,15662,13535
1,3,KING ST E,,JARVIS ST,,08/23/1950,43.650461,-79.371924,2016-09-17,12960,7333
2,4,JARVIS ST,,ADELAIDE ST E,,09/12/1958,43.651534,-79.37236,2016-11-08,17770,7083
3,5,JARVIS ST,,RICHMOND ST E,,04/21/1962,43.652718,-79.372824,2015-12-08,19678,4369
4,6,JARVIS ST,,QUEEN ST E,,08/24/1928,43.653704,-79.373238,2016-09-17,14487,3368


#### We will sum the `8 Peak Hr Vehicle Volume` and `8 Peak Hr Pedestrian Volume` columns into one column `Volume`

In [107]:
# Combined vehicle + pedestrian count per signal; missing counts are treated as 0
volume_columns = ['8 Peak Hr Vehicle Volume', '8 Peak Hr Pedestrian Volume']
traffic_data['Volume'] = traffic_data[volume_columns].fillna(0).sum(axis=1)
traffic_data.head()

Unnamed: 0,TCS #,Main,Midblock Route,Side 1 Route,Side 2 Route,Activation Date,Latitude,Longitude,Count Date,8 Peak Hr Vehicle Volume,8 Peak Hr Pedestrian Volume,Volume
0,2,JARVIS ST,,FRONT ST E,,11/15/1948,43.649418,-79.371446,2017-06-21,15662,13535,29197
1,3,KING ST E,,JARVIS ST,,08/23/1950,43.650461,-79.371924,2016-09-17,12960,7333,20293
2,4,JARVIS ST,,ADELAIDE ST E,,09/12/1958,43.651534,-79.37236,2016-11-08,17770,7083,24853
3,5,JARVIS ST,,RICHMOND ST E,,04/21/1962,43.652718,-79.372824,2015-12-08,19678,4369,24047
4,6,JARVIS ST,,QUEEN ST E,,08/24/1928,43.653704,-79.373238,2016-09-17,14487,3368,17855


#### In determining the overall street traffic volume near a neighbourhood, we can use scikit-learn's [Radius Neighbours Regressor (RNR)](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html) using a training set and test set to build a map of traffic volume based on how busy the nearest roads are

In [121]:
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.model_selection import train_test_split
import plotly.graph_objs as go
import plotly.offline as py

In [152]:
# Features are (longitude, latitude) in degrees; the target is the combined volume
traffic_X = traffic_data[['Longitude', 'Latitude']].values
traffic_y = traffic_data['Volume'].values

# The split uses a fixed random_state, so it is identical for every radius:
# compute it once instead of on every loop iteration.
traffic_X_train, traffic_X_test, traffic_y_train, traffic_y_test = train_test_split(
    traffic_X, traffic_y, test_size=0.4, random_state=100
)

scores = [] # one test-set score per candidate radius (50 in total)

for rad in range(100, 5100, 100): # radius values from 100 m to 5000 m in 100 m increments
    # rad divided by 111111 m/degree to convert metres to degrees.
    # NOTE(review): the same factor is applied to longitude and latitude, so
    # the radius is only approximate in the east-west direction.
    # NOTE: this rebinds the notebook-level name `radius`.
    radius = rad / 111111
    traffic_neigh = RadiusNeighborsRegressor(radius=radius)
    traffic_neigh.fit(traffic_X_train, traffic_y_train)
    scores.append(traffic_neigh.score(traffic_X_test, traffic_y_test))

#### Plot of the scores

In [153]:
# x: candidate radius in metres (same range as the search loop); y: the score obtained for it
rnr_r_scatter = go.Scatter(x = list(range(100, 5100, 100)), y = scores)
py.iplot([rnr_r_scatter])

#### What is the best one?

In [160]:
# Candidate radii in metres, in the same order the scores were appended
candidate_radii = list(range(100, 5100, 100))
best_score = max(scores)
index_of_max = scores.index(best_score)  # first position of the best score
print("The best score is: " + str(best_score))
print("With a radius of: " + str(candidate_radii[index_of_max]))

The best score is: 0.08566896473330354
With a radius of: 2800


#### This R^2 score is obviously not the best, but let's see what we can map out with it

In [None]:
# Refit on all rows with the best radius found above (2800 m, converted to
# degrees with the same 111111 m/degree factor used in the search loop)
traffic_neigh = RadiusNeighborsRegressor(radius = 2800 / 111111)
traffic_neigh.fit(traffic_X, traffic_y)