# Download Toronto Neighborhood Information from Wikipedia and format it accordingly

### Installing, importing necessary packages

In [1]:
# Install as necessary
### pip install arcgis

# Standard library
import os

# Third-party packages
import folium # map rendering library
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests # library to handle requests
from arcgis.geocoding import batch_geocode, get_geocoders
from arcgis.gis import GIS
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans # k-means clustering

  pd.datetime,


### Import data from Wikipedia and drop rows from the dataframe where 'Borough' is equal to 'Not assigned'

In [2]:
# Wikipedia page URL
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use pandas to load the first table found on the Wikipedia page,
# taking its first row as the header
df_CA_raw = pd.read_html(url, header=0)[0]

# Turn 'Not assigned' into a missing value, then drop the rows whose
# 'Borough' is missing. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0. The raw table is left untouched so this cell can
# be re-run safely.
df_CA = (
    df_CA_raw
    .replace('Not assigned', np.nan)
    .dropna(subset=['Borough'])
)

### Find coordinates for postal codes and add onto current dataset
#### Using arcgis geocoder for a batch pull of coordinates

In [3]:
# Pull the 'Postal code' column out of the dataframe as a plain list
postal_codes = df_CA.loc[:, "Postal code"].to_list()

In [4]:
# Sign in to ArcGIS Online. The username and password below are
# placeholders - the real values were withheld. HTTPS is used so the
# credentials are not sent over a plain-text connection.
gis = GIS("https://www.arcgis.com", "User_Name", "Pass_Word")

## use the first of GIS's configured geocoders
geocoder = get_geocoders(gis)[0]

# Geocode all postal codes in one batch, restricted to Canada. The
# geocoder selected above is passed explicitly so the lookup does not
# depend on whichever GIS happens to be the active one.
CA_geo = batch_geocode(postal_codes, source_country="CA", geocoder=geocoder)

### Use a for loop to collect the postal codes and coordinates, then store them in a dataframe

In [5]:
## Collect the matched address and coordinates from every geocoding result.
## Each result is read as result['address'] and result['location'];
## 'y' is stored as the latitude and 'x' as the longitude.
Postal_code = [result['address'] for result in CA_geo]
Lat = [result['location']['y'] for result in CA_geo]
Long = [result['location']['x'] for result in CA_geo]

# Build the postal code / coordinate table in a single step; the three
# lists are parallel, so no index-based merging is needed
df_CA_coords_code = pd.DataFrame({
    'Postal code': Postal_code,
    'Latitude': Lat,
    'Longitude': Long,
})

# Merge tables to include postal codes, boroughs, neighborhoods & coordinates
# (longitude & latitude). The join key is named explicitly; the default
# inner join keeps only postal codes present in both tables.
df_CA_geo = pd.merge(df_CA, df_CA_coords_code, on='Postal code')

# Stored as CSV to save lookup credits during the code building process.
# The index is written too (it comes back as 'Unnamed: 0' when re-read).
df_CA_geo.to_csv('df_CA_geo.csv', sep='\t', encoding='utf-8')

In [6]:
# Reload the cached tab-separated file; the index that was saved with it
# comes back as the 'Unnamed: 0' column, which is discarded
df_CA_geo = pd.read_csv('df_CA_geo.csv', sep='\t').drop(columns='Unnamed: 0')

### Plot Toronto's neighborhoods on a map and cluster the boroughs

In [7]:
# Get coordinates (latitude, longitude) for Toronto
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="CA_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; stop with a clear message
# instead of failing later with an AttributeError on None
if location is None:
    raise ValueError('Nominatim returned no result for address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude

# Keep only the rows whose borough name contains 'Toronto'
toronto_geo = df_CA_geo[df_CA_geo['Borough'].str.contains('Toronto')].reset_index(drop=True)

#### Create a map of Toronto with neighborhoods superimposed on top.

In [8]:
# Create map of Toronto centred on the latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add one circle marker per neighborhood. The loop variables use their own
# names (lat, lng) so that no dataframe-named global is rebound to a scalar.
for lat, lng, borough, neighborhood in zip(toronto_geo['Latitude'], toronto_geo['Longitude'], toronto_geo['Borough'], toronto_geo['Neighborhood']):
    label_text = '{}, {}'.format(neighborhood, borough)
    popup = folium.Popup(label_text, parse_html=True)
    # NOTE(review): parse_html is a Popup option; confirm that CircleMarker
    # does anything with it before relying on it here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# Map of Canada's postal codes for boroughs in Toronto
map_toronto

#### Examining popular venues within the Toronto region

In [9]:
# Show the first ten Toronto neighborhoods with their coordinates (latitude & longitude)
toronto_geo.iloc[:10]

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.650964,-79.353041
1,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.66179,-79.38939
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
3,M5C,Downtown Toronto,St. James Town,43.651734,-79.375554
4,M4E,East Toronto,The Beaches,43.678148,-79.295349
5,M5E,Downtown Toronto,Berczy Park,43.645196,-79.373855
6,M5G,Downtown Toronto,Central Bay Street,43.656072,-79.385653
7,M6G,Downtown Toronto,Christie,43.668602,-79.420387
8,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650542,-79.384116
9,M6H,West Toronto,Dufferin / Dovercourt Village,43.66491,-79.438664


#### Define Foursquare Credentials, Version and Limit Parameter

In [10]:
# Credentials are read from the environment instead of being hard-coded, so
# they never end up in a shared notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; when they are unset
# the values are empty strings. Any credentials that were
# previously published in this notebook should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # sent as the `limit` query parameter of each request

### Gather venue information from Foursquare using a function

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch the venues around each location from the Foursquare explore endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with zip(); each triple describes one location.
        Every name is printed as its request is made.
    radius : int, optional
        Value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no
        venue is found the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status code.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string; the timeout keeps
        # a stalled connection from hanging the notebook indefinitely
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface HTTP errors (bad credentials, quota, ...) directly rather
        # than as a KeyError while digging through the JSON body
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when no rows
    # were collected, where assigning `.columns` afterwards would fail
    return pd.DataFrame(venue_rows, columns=columns)

#### Run the function on each neighborhood and create a new dataframe called 'toronto_venues'.

In [12]:
# Query the venues around every Toronto neighborhood (default search radius)
toronto_venues = getNearbyVenues(
    names=toronto_geo['Neighborhood'],
    latitudes=toronto_geo['Latitude'],
    longitudes=toronto_geo['Longitude'],
)

Regent Park / Harbourfront
Queen's Park / Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond / Adelaide / King
Dufferin / Dovercourt Village
Harbourfront East / Union Station / Toronto Islands
Little Portugal / Trinity
The Danforth West / Riverdale
Toronto Dominion Centre / Design Exchange
Brockton / Parkdale Village / Exhibition Place
India Bazaar / The Beaches West
Commerce Court / Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park / The Junction South
North Toronto West
The Annex / North Midtown / Yorkville
Parkdale / Roncesvalles
Davisville
University of Toronto / Harbord
Runnymede / Swansea
Moore Park / Summerhill East
Kensington Market / Chinatown / Grange Park
Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst  Quay / South Niagara / Island airport
Rosed

In [13]:
# Report the (rows, columns) shape, then show the first five venues
print(toronto_venues.shape)
toronto_venues.iloc[:5]

(1672, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park / Harbourfront,43.650964,-79.353041,Souk Tabule,43.653756,-79.35439,Mediterranean Restaurant
1,Regent Park / Harbourfront,43.650964,-79.353041,Young Centre for the Performing Arts,43.650825,-79.357593,Performing Arts Venue
2,Regent Park / Harbourfront,43.650964,-79.353041,SOMA chocolatemaker,43.650622,-79.358127,Chocolate Shop
3,Regent Park / Harbourfront,43.650964,-79.353041,Cluny Bistro & Boulangerie,43.650565,-79.357843,French Restaurant
4,Regent Park / Harbourfront,43.650964,-79.353041,BATLgrounds,43.647088,-79.351306,Athletics & Sports


In [14]:
# Cache the venue table as a tab-separated file so that location lookups
# do not have to be repeated during code building
toronto_venues.to_csv(path_or_buf='df_toronto_venues.csv', sep='\t', encoding='utf-8')

In [15]:
# Load the cached venue table; the index saved with it comes back as the
# 'Unnamed: 0' column and is removed
df_toronto_venues = pd.read_csv('df_toronto_venues.csv', sep='\t').drop(columns='Unnamed: 0')
pd.DataFrame(df_toronto_venues)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park / Harbourfront,43.650964,-79.353041,Souk Tabule,43.653756,-79.354390,Mediterranean Restaurant
1,Regent Park / Harbourfront,43.650964,-79.353041,Young Centre for the Performing Arts,43.650825,-79.357593,Performing Arts Venue
2,Regent Park / Harbourfront,43.650964,-79.353041,SOMA chocolatemaker,43.650622,-79.358127,Chocolate Shop
3,Regent Park / Harbourfront,43.650964,-79.353041,Cluny Bistro & Boulangerie,43.650565,-79.357843,French Restaurant
4,Regent Park / Harbourfront,43.650964,-79.353041,BATLgrounds,43.647088,-79.351306,Athletics & Sports
...,...,...,...,...,...,...,...
1667,Business reply mail Processing CentrE,43.648700,-79.385450,Lavish & Squalor,43.650304,-79.388927,Clothing Store
1668,Business reply mail Processing CentrE,43.648700,-79.385450,The Gabardine,43.650988,-79.381225,American Restaurant
1669,Business reply mail Processing CentrE,43.648700,-79.385450,Indigospirit,43.648350,-79.380347,Bookstore
1670,Business reply mail Processing CentrE,43.648700,-79.385450,Ki Modern Japanese + Bar,43.647167,-79.379608,Japanese Restaurant


### Analyze Venues by Neighborhood

In [16]:
# One-hot encode the venue category. With prefix="" and prefix_sep=" " each
# dummy column is named after its category preceded by a single space.
# NOTE(review): presumably this keeps a venue category that is itself called
# "Neighborhood" from clashing with the 'Neighborhood' column added below -
# confirm before changing the separator.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep=" ")

# Put the neighborhood name in front as the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# Average the indicator columns per neighborhood, giving the frequency of
# every category within that neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [17]:
# Print every neighborhood followed by its 5 most frequent venue categories
tor_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print('----{}----'.format(hood))
    # Transpose the neighborhood's single row so that each category is a row
    profile = toronto_grouped.loc[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    profile.columns = ['venue', 'freq']
    # The first row holds the neighborhood name itself, so it is skipped
    profile = profile.iloc[1:].assign(freq=lambda frame: frame['freq'].astype(float))
    # Round before sorting, then renumber the rows from zero
    ranked = (
        profile
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(tor_top_venues))
    print('\n')

----Berczy Park----
                 venue  freq
0          Coffee Shop  0.11
1           Restaurant  0.05
2   Seafood Restaurant  0.05
3         Cocktail Bar  0.05
4               Bakery  0.03


----Brockton / Parkdale Village / Exhibition Place----
                     venue  freq
0              Coffee Shop  0.09
1                     Café  0.07
2   Thrift / Vintage Store  0.05
3               Restaurant  0.05
4                Gift Shop  0.05


----Business reply mail Processing CentrE----
                 venue  freq
0          Coffee Shop  0.10
1                Hotel  0.04
2                  Bar  0.04
3           Restaurant  0.04
4   Seafood Restaurant  0.03


----CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst  Quay / South Niagara / Island airport----
                   venue  freq
0            Coffee Shop  0.07
1                   Café  0.06
2             Restaurant  0.06
3      French Restaurant  0.04
4   Gym / Fitness Center  0.04


----Central Bay S

In [18]:
# Function that sorts the venues in descending order
def return_most_common_venues(row, tor_top_venues):
    """Return the labels of the largest entries of `row`, skipping its first entry.

    The first element of `row` is left out, the remaining values are sorted
    in descending order, and the index labels of the first `tor_top_venues`
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:tor_top_venues]

Put into a new dataframe and display the top 10 venues for each neighborhood

In [19]:
# Set number of top venues to search for
tor_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create one column label per rank: '1st', '2nd', '3rd', then '4th', '5th', ...
# The suffix is chosen with an explicit check instead of a bare `except`,
# which would also have hidden unrelated errors.
columns = ['Neighborhood']
for ind in np.arange(tor_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Rank the venue categories of every neighborhood, then build the dataframe
# in one go rather than filling it one row at a time
top_venue_rows = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], tor_top_venues)
    for ind in np.arange(toronto_grouped.shape[0])
]
toronto_neighborhoods_venues_sorted = pd.DataFrame(
    top_venue_rows,
    columns=columns[1:],
    index=toronto_grouped.index,
)
toronto_neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

toronto_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Bakery,Lounge,Cheese Shop,Farmers Market,Breakfast Spot,Beer Bar
1,Brockton / Parkdale Village / Exhibition Place,Coffee Shop,Café,Gift Shop,Restaurant,Thrift / Vintage Store,Accessories Store,North Indian Restaurant,Caribbean Restaurant,Sandwich Place,Chiropractor
2,Business reply mail Processing CentrE,Coffee Shop,Bar,Restaurant,Hotel,American Restaurant,Seafood Restaurant,Pub,Café,Italian Restaurant,Tea Room
3,CN Tower / King and Spadina / Railway Lands / ...,Coffee Shop,Café,Restaurant,Park,Gym / Fitness Center,French Restaurant,Lounge,Bar,Italian Restaurant,Speakeasy
4,Central Bay Street,Coffee Shop,Clothing Store,Japanese Restaurant,Thai Restaurant,Spa,Bookstore,Bubble Tea Shop,Sandwich Place,Restaurant,Sushi Restaurant


### Cluster Neighborhoods
#### Run k-means to cluster the neighborhood into 5 clusters and analyze

In [20]:
# set number of clusters
kclusters = 5

# Drop the neighborhood name so that only the category frequencies are
# clustered. The keyword form is required: passing the axis positionally
# (`drop('Neighborhood', 1)`) is no longer accepted by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering model; random_state makes the run repeatable and
# n_init is pinned to 10 so the result does not change with the
# scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

## Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

# add clustering labels as the first column; an existing 'Cluster Labels'
# column is dropped first so that re-running this cell does not fail
toronto_neighborhoods_venues_sorted = toronto_neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
toronto_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Start from the Toronto neighborhood data and attach the cluster label and
# top venues of each neighborhood, matching rows on 'Neighborhood'.
# join() returns a new dataframe, so toronto_geo itself is not modified.
toronto_merged = toronto_geo.join(toronto_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

##### Map Resulting Clusters

In [21]:
# Create map with clusters
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # A neighborhood without a match in the join has a missing label (and
    # turns the whole column into floats); it cannot be coloured, so skip it
    if pd.isna(cluster):
        continue
    # list indices must be integers, so undo any float conversion
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster - 1 is used as the colour index, so label 0 wraps around
        # to the last colour in the list
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine Clusters

Cluster 1

In [22]:
# Rows with cluster label 0: column 1 plus every column from position 5 onward
toronto_merged[toronto_merged['Cluster Labels'].eq(0)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,East Toronto,0,Business Service,Park,Discount Store,Grocery Store,Bus Line,Farm,Fountain,Food Truck,Food Court,Food & Drink Shop
21,Central Toronto,0,Home Service,Park,Ethiopian Restaurant,Food Truck,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop
23,Central Toronto,0,Playground,Garden,Park,Gym Pool,Elementary School,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market
33,Downtown Toronto,0,Playground,Park,Candy Store,Grocery Store,Yoga Studio,Ethiopian Restaurant,Food Court,Food & Drink Shop,Food,Flower Shop


Cluster 2

In [23]:
# Rows with cluster label 1: column 1 plus every column from position 5 onward
toronto_merged[toronto_merged['Cluster Labels'].eq(1)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,1,Pub,Coffee Shop,Music Venue,Café,Athletics & Sports,Bank,Chocolate Shop,Seafood Restaurant,French Restaurant,Tech Startup
1,Downtown Toronto,1,Coffee Shop,Café,Yoga Studio,Bookstore,Italian Restaurant,Juice Bar,Fried Chicken Joint,Middle Eastern Restaurant,Distribution Center,Discount Store
2,Downtown Toronto,1,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Cosmetics Shop,Sandwich Place,Café,Hotel,Tanning Salon,Diner,Japanese Restaurant
3,Downtown Toronto,1,Coffee Shop,Café,Cosmetics Shop,Restaurant,Diner,Bakery,Japanese Restaurant,Italian Restaurant,Cocktail Bar,American Restaurant
4,East Toronto,1,Coffee Shop,Church,Health Food Store,Neighborhood,Trail,Pub,Farmers Market,Falafel Restaurant,Farm,Yoga Studio
5,Downtown Toronto,1,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Bakery,Lounge,Cheese Shop,Farmers Market,Breakfast Spot,Beer Bar
6,Downtown Toronto,1,Coffee Shop,Clothing Store,Japanese Restaurant,Thai Restaurant,Spa,Bookstore,Bubble Tea Shop,Sandwich Place,Restaurant,Sushi Restaurant
7,Downtown Toronto,1,Café,Grocery Store,Coffee Shop,Candy Store,Athletics & Sports,Baby Store,Playground,Park,Food Court,Food & Drink Shop
8,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Clothing Store,Thai Restaurant,Gym,Bakery,Bookstore,Sushi Restaurant,Lounge
9,West Toronto,1,Park,Pharmacy,Gym / Fitness Center,Furniture / Home Store,Brazilian Restaurant,Grocery Store,Bank,Bakery,Gym,Café


Cluster 3

In [24]:
# Rows with cluster label 2: column 1 plus every column from position 5 onward
toronto_merged[toronto_merged['Cluster Labels'].eq(2)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Central Toronto,2,Home Service,Spa,Ethiopian Restaurant,Food Truck,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop


Cluster 4

In [25]:
# Rows with cluster label 3: column 1 plus every column from position 5 onward
toronto_merged[toronto_merged['Cluster Labels'].eq(3)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,3,Harbor / Marina,Theme Park,Park,Farm,Yoga Studio,Ethiopian Restaurant,Food Truck,Food Court,Food & Drink Shop,Food


Cluster 5

In [26]:
# Rows with cluster label 4: column 1 plus every column from position 5 onward
toronto_merged[toronto_merged['Cluster Labels'].eq(4)].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,4,Bus Line,Swim School,Lawyer,Yoga Studio,Fountain,Food Truck,Food Court,Food & Drink Shop,Food,Flower Shop


# Fin
##### 'Fin' means 'end' in Spanish

### Comments and feedback welcome!