# Import Relevant Libraries for use

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


# Download and clean data with Postal Codes

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" 
data = pd.read_html(url, header = 0)
data2 = data[0]
NA = data2[ data2['Borough'] == "Not assigned" ].index
 
# Delete these row from dataFrame
data2.drop(NA , inplace=True)
data2.reset_index(drop=True, inplace=True)
data2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Read CSV data with Latitude and Logitude Data
### Merge both Data frames using Postal Code as a Unique identifyer

In [4]:
geo_data = pd.read_csv("http://cocl.us/Geospatial_data")
df_merged = pd.merge(data2, geo_data, how = "inner", on = "Postal Code")
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [6]:
df_merged.columns

Index(['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [7]:
df = df_merged[["Borough", "Neighborhood", "Latitude", "Longitude"]]
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.753259,-79.329656
1,North York,Victoria Village,43.725882,-79.315572
2,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Create map of Toronto with Neighborhoods superimposed on it

In [8]:
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [10]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [11]:
Central_Toronto = df[df['Borough'] == 'Central Toronto'].reset_index(drop=True)
Central_Toronto.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Central Toronto,Lawrence Park,43.72802,-79.38879
1,Central Toronto,Roselawn,43.711695,-79.416936
2,Central Toronto,Davisville North,43.712751,-79.390197
3,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
4,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678


In [13]:
address = 'Central Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.6534817, -79.3839347.


In [14]:
# Use foursquare APIcrednetials to enable access to venues data from the four square database
CLIENT_ID = 'OF35QXMM5TXNQ0BXPCP42UBURIZXZMCWFQ34O0W0HKPNVLUV' # Foursquare ID
CLIENT_SECRET = '2ROJOXMVFYBNWT5KZT4MAAMIKJ2OGC32A1OYZRPF52TKDPVH' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OF35QXMM5TXNQ0BXPCP42UBURIZXZMCWFQ34O0W0HKPNVLUV
CLIENT_SECRET:2ROJOXMVFYBNWT5KZT4MAAMIKJ2OGC32A1OYZRPF52TKDPVH


In [19]:
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [17]:
# exploring the first neighborhood in the df
Central_Toronto.loc[0, 'Neighborhood']
neighborhood_latitude = Central_Toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Central_Toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Central_Toronto.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


In [18]:
# exploring top 5 venues in Lawrence Park
LIMIT = 10 # limit of number of venues returned by Foursquare API

radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=OF35QXMM5TXNQ0BXPCP42UBURIZXZMCWFQ34O0W0HKPNVLUV&client_secret=2ROJOXMVFYBNWT5KZT4MAAMIKJ2OGC32A1OYZRPF52TKDPVH&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=10'

In [20]:
#results for top 5 venues
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ee6392bc7638759c3b0f402'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [22]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


# Exploring neighborhoods in Toronto


In [23]:
#define a function that will repeat the process above for all neighborhood in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Now write the code to run the above function on each neighborhood and create a new dataframe called toronto_venues

In [24]:
toronto_venues = getNearbyVenues(names=Central_Toronto['Neighborhood'],
                                   latitudes=Central_Toronto['Latitude'],
                                   longitudes=Central_Toronto['Longitude']
                                  )

Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park


In [26]:
# Checking the size of the resulting dataframe

print(toronto_venues.shape)
toronto_venues.head()

(61, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Roselawn,43.711695,-79.416936,Ceiling Champions,43.713891,-79.420702,Home Service
4,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden


In [27]:
# Number of venues returned for each neighborhood in Central Toronto
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,10,10,10,10,10,10
Davisville North,9,9,9,9,9,9
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",2,2,2,2,2,2
"North Toronto West, Lawrence Park",10,10,10,10,10,10
Roselawn,3,3,3,3,3,3
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",10,10,10,10,10,10
"The Annex, North Midtown, Yorkville",10,10,10,10,10,10


# Analysing each neighborhood in Central Toronto

In [55]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Breakfast Spot,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,...,Sandwich Place,Seafood Restaurant,Spa,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Lawrence Park,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
toronto_onehot.shape

(61, 44)

# Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [57]:
C_Toronto = toronto_onehot.groupby('Neighborhood').mean().reset_index()
C_Toronto

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Breakfast Spot,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,...,Sandwich Place,Seafood Restaurant,Spa,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,...,0.0,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"North Toronto West, Lawrence Park",0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,...,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0
8,"The Annex, North Midtown, Yorkville",0.0,0.1,0.0,0.1,0.0,0.2,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0


# Print each neighborhood along with the top 5 most common venue

In [58]:
num_top_venues = 5

for hood in C_Toronto['Neighborhood']:
    print("----"+hood+"----")
    temp = C_Toronto[C_Toronto['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0  Italian Restaurant   0.2
1        Dessert Shop   0.2
2    Sushi Restaurant   0.1
3   Indian Restaurant   0.1
4         Pizza Place   0.1


----Davisville North----
            venue  freq
0  Sandwich Place  0.11
1  Breakfast Spot  0.11
2            Park  0.11
3           Hotel  0.11
4     Pizza Place  0.11


----Forest Hill North & West, Forest Hill Road Park----
                 venue  freq
0        Jewelry Store  0.25
1                Trail  0.25
2     Sushi Restaurant  0.25
3   Mexican Restaurant  0.25
4  American Restaurant  0.00


----Lawrence Park----
                 venue  freq
0             Bus Line  0.33
1          Swim School  0.33
2                 Park  0.33
3  American Restaurant  0.00
4   Salon / Barbershop  0.00


----Moore Park, Summerhill East----
                 venue  freq
0                  Gym   0.5
1                 Park   0.5
2  American Restaurant   0.0
3    Indian Restaurant   0.0
4        Jewelry Store   0.0


# Putting into Data Frames

In [59]:
# define a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [60]:
# this function creates the new dataframe and display the top 5 venues for each neighborhood
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = C_Toronto['Neighborhood']

for ind in np.arange(C_Toronto.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(C_Toronto.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Davisville,Italian Restaurant,Dessert Shop,Sushi Restaurant,Indian Restaurant,Café
1,Davisville North,Department Store,Sandwich Place,Gym,Food & Drink Shop,Park
2,"Forest Hill North & West, Forest Hill Road Park",Trail,Sushi Restaurant,Jewelry Store,Mexican Restaurant,Yoga Studio
3,Lawrence Park,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop
4,"Moore Park, Summerhill East",Gym,Park,Yoga Studio,Hotel,Gym / Fitness Center


# Cluster Neighborhoods

In [61]:
# Run K-means to cluster neighborhhods into 3

# set number of clusters
kclusters = 3

C_Toronto_clustering = C_Toronto.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(C_Toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

array([0, 0, 0, 0, 2], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [62]:
# add clustering labels

neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

C_Toronto_Merged = Central_Toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
C_Toronto_Merged = C_Toronto_Merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

C_Toronto_Merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop
1,Central Toronto,Roselawn,43.711695,-79.416936,1,Ice Cream Shop,Garden,Home Service,Breakfast Spot,Burger Joint
2,Central Toronto,Davisville North,43.712751,-79.390197,0,Department Store,Sandwich Place,Gym,Food & Drink Shop,Park
3,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307,0,Trail,Sushi Restaurant,Jewelry Store,Mexican Restaurant,Yoga Studio
4,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,0,Yoga Studio,Chinese Restaurant,Fast Food Restaurant,Mexican Restaurant,Diner


# Vizualizing Resulting Clusters

In [63]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(C_Toronto_Merged['Latitude'], C_Toronto_Merged['Longitude'], C_Toronto_Merged['Neighborhood'], C_Toronto_Merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters