Load Data

In [40]:
import pandas as pd
df=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',skiprows = [0])[0]

In [41]:
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [42]:
df.drop(df.loc[df['Borough']=='Not assigned'].index, inplace=True)

Check the duplicated Postalcode

In [43]:
df['PostalCode'].value_counts()

M9V    8
M8Y    8
M5V    7
M9B    5
M4V    5
      ..
M3M    1
M4R    1
M7R    1
M3N    1
M1S    1
Name: PostalCode, Length: 103, dtype: int64

Remove the dupplicated postalcode

In [44]:
df.sort_values("PostalCode", inplace = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
9,M1B,Scarborough,Rouge
10,M1B,Scarborough,Malvern
27,M1C,Scarborough,Port Union
26,M1C,Scarborough,Rouge Hill
25,M1C,Scarborough,Highland Creek


In [45]:
df.drop_duplicates("PostalCode", inplace = True) 

Replace Not assigned Neighborhood by Borough

In [46]:
df[df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M9A,Queen's Park,Not assigned


In [47]:
mask = df['Neighborhood'] == "Not assigned"
df.loc[mask, 'Neighborhood'] = df.loc[mask, 'Borough']

In [48]:
df[df['PostalCode'] == 'M9A']

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M9A,Queen's Park,Queen's Park


In [49]:
df.shape

(103, 3)

Geolocalization

In [82]:
# import the geolocation cordinators 
coordinates = pd.read_csv('Geospatial_Coordinates.csv')

#join with current database 

df_new = df.set_index('PostalCode').join(coordinates.set_index('Postal Code')).reset_index()


#other way
#coordinates.rename(index=str, columns={"Postal Code": "Postcode"}, inplace = True)
#neighborhoods = pd.merge(can_df, coordinates, on='Postcode', how='inner')

In [83]:
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.806686,-79.194353
1,M1C,Scarborough,Port Union,43.784535,-79.160497
2,M1E,Scarborough,Guildwood,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Exploration of the neighborhoods


In [84]:
import numpy as np
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors 
from sklearn.cluster import KMeans
import folium 
from pandas.io.json import json_normalize

We keep only the boroughs containing 'Toronto'

In [93]:
toronto = df_new[[x.find('Toronto')>=0 for x in df_new['Borough'].values]].reset_index(drop=True)

In [94]:
toronto.shape

(39, 5)

Find the geolocation of Toronto 

In [102]:
from geopy.geocoders import Nominatim 
address = 'Toronto City, ON'

geolocator = Nominatim(user_agent = 'ny_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print ('The geographical cordinate of Toronto City are {}, {}'.format(latitude, longitude))

The geographical cordinate of Toronto City are 43.653963, -79.387207


We create a map of Toronto and mark the remaining neighborhoods

In [103]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

Definde credential Foursquare

In [104]:
CLIENT_ID = '3I22FVMIBODABLJYTYNYTU3NCFN2C4DPXSS1MFIO42MY4B2A' 
CLIENT_SECRET = 'RGKOTFDPW0ERARMJEWT20ULIIH3XGIIA3E5V0WVFTD51EB0Q'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3I22FVMIBODABLJYTYNYTU3NCFN2C4DPXSS1MFIO42MY4B2A
CLIENT_SECRET:RGKOTFDPW0ERARMJEWT20ULIIH3XGIIA3E5V0WVFTD51EB0Q


Below we get a list of venues located in each neighborhood

In [105]:
LIMIT = 300
radius = 500 # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [106]:
toronto_venues = getNearbyVenues(names=toronto['Neighborhood'],
                                   latitudes=toronto['Latitude'],
                                   longitudes=toronto['Longitude']
                                  )

The Beaches
Riverdale
The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East
South Hill
Rosedale
St. James Town
Church and Wellesley
Harbourfront
Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide
Harbourfront East
Design Exchange
Commerce Court
Roselawn
Forest Hill North
Yorkville
University of Toronto
Kensington Market
King and Spadina
Stn A PO Boxes 25 The Esplanade
Underground city
Christie
Dufferin
Little Portugal
Exhibition Place
High Park
Roncesvalles
Runnymede
Queen's Park
Business Reply Mail Processing Centre 969 Eastern


In [107]:
toronto_venues.shape

(1691, 7)

In [108]:
toronto_venues.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
5,Riverdale,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
6,Riverdale,43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
7,Riverdale,43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
8,Riverdale,43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant
9,Riverdale,43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


In [110]:
toronto_venues['Neighborhood'].value_counts()

St. James Town                                       141
Garden District                                      100
Commerce Court                                       100
Harbourfront East                                    100
Underground city                                     100
Design Exchange                                      100
Adelaide                                             100
Stn A PO Boxes 25 The Esplanade                       95
Central Bay Street                                    85
Kensington Market                                     85
Church and Wellesley                                  80
Berczy Park                                           56
Little Portugal                                       53
Harbourfront                                          46
Studio District                                       41
Riverdale                                             40
Queen's Park                                          38
Runnymede                      

In [111]:
print ('There are {} unique categories'.format(len(toronto_venues['Venue Category'].unique())))

There are 228 unique categories


Below we compute for each neighborhood the proportion of each venue categories

In [112]:
# one hot encoding
Toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(Toronto_grouped.shape)

num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

(38, 228)
----Adelaide----
             venue  freq
0      Coffee Shop  0.06
1  Thai Restaurant  0.04
2             Café  0.04
3       Steakhouse  0.04
4       Restaurant  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1          Steakhouse  0.04
2                Café  0.04
3            Beer Bar  0.04
4  Seafood Restaurant  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2       Auto Workshop  0.06
3          Comic Shop  0.06
4         Pizza Place  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1  Italian Restaurant  0.05
2                Café  0.05
3      Ice Cream Shop  0.04
4        Burger Joint  0.04


----Christie----
           venue  freq
0  Grocery Store  0.22
1           Café  0.17
2           Park  0.11
3          Diner  0.06
4    Candy Store  0.06


----Church and Wellesley----
                 ven

Below we create a data frame neighborhoods_venues_sorted, representing the most common categories of venues for each neighborhood

In [113]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Adelaide,Coffee Shop,Steakhouse,Thai Restaurant
1,Berczy Park,Coffee Shop,Steakhouse,Farmers Market
2,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop
3,Central Bay Street,Coffee Shop,Café,Italian Restaurant
4,Christie,Grocery Store,Café,Park


Clustering the neighborhoods

In [115]:
# set number of clusters
kcluster = 3

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

In [120]:
# run k-means clustering
kmeans = KMeans(n_clusters=kcluster, random_state=0).fit(Toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2])

In [121]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [127]:
Toronto_merged = toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.shape # check the last columns!

(39, 9)

We show the labels obtained for each neighborhood on the following map

In [125]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kcluster)
ys = [i + x + (i*x)**2 for i in range(kcluster)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters