# IBM Data Science Capstone Project : Analysing Toronto, Canada Neighbourhood Data

#### Author : Ajay Rabidas

In [173]:
import numpy as np
import pandas as pd

In [174]:
#print('Hello Capstone Project Course!')

### Installing HTML parser libraries

In [175]:
#!conda install -c anaconda lxml
#!conda install -c anaconda BeautifulSoup

In [176]:
from bs4 import BeautifulSoup
import requests

In [177]:
# Column layout of the postal-code table.
columns = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Empty frame with that layout.
neighborhoods = pd.DataFrame(columns=columns)

### Scraping the HTML of the Wikipedia list of Canadian postal codes (M)

In [178]:
# Download the Wikipedia page and locate its first "wikitable".
# A timeout keeps the cell from hanging forever, and raise_for_status() turns an
# HTTP error page into an immediate, readable failure instead of a parse of junk HTML.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
source = wiki_response.text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', class_='wikitable')
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError in the cell that walks the table.
    raise ValueError("No <table class='wikitable'> found on the Wikipedia page")

In [179]:
#Extracting table from wiki html page
# Extract the table rows from the Wikipedia HTML page.
# The rows are gathered in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0, copied the whole frame on every
# call, and duplicated all rows whenever this cell was re-run.
records = []
for row in table.tbody.find_all('tr'):
    # The row text is newline-separated; the first and last pieces are the
    # empty strings around the cell values.
    data = row.text.split('\n')[1:-1]
    if len(data) < 3:
        # Not a three-cell data row; indexing it below would raise IndexError.
        continue
    records.append({'PostalCode': data[0],
                    'Borough': data[1],
                    'Neighborhood': data[2]})

# Every <tr> is kept, including the first one, which the next cell drops as row 0.
neighborhoods = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [180]:
# Remove row 0 (presumably the table's header row -- the first data row shown is M1A).
# errors='ignore' plus re-binding instead of inplace=True makes the cell safe to
# re-run: the original raised KeyError the second time because label 0 was gone.
neighborhoods = neighborhoods.drop(index=0, errors='ignore')
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [181]:
# (rows, columns) of the scraped table after row 0 was dropped.
neighborhoods.shape

(288, 3)

### Data cleaning
##### 1. Dropping rows whose Borough is "Not assigned"
##### 2. Replacing a "Not assigned" Neighbourhood with the Borough name of the same row

In [None]:
# 1. Keep only the rows that have a Borough. The explicit .copy() makes neighDF
#    an independent frame, so the assignment below cannot trigger
#    SettingWithCopyWarning or be silently lost under pandas copy-on-write
#    (which is what happens to an in-place mask() on a column of a slice).
neighDF = neighborhoods[neighborhoods['Borough'] != 'Not assigned'].copy()

# 2. Where the Neighborhood is still "Not assigned", use the Borough of that row.
unassigned = neighDF['Neighborhood'] == 'Not assigned'
neighDF.loc[unassigned, 'Neighborhood'] = neighDF.loc[unassigned, 'Borough']

In [184]:
# Spot check: show the rows whose Borough is "Queen's Park".
neighDF[neighDF['Borough'] =="Queen's Park"].head()

Unnamed: 0,PostalCode,Borough,Neighborhood
9,M7A,Queen's Park,Queen's Park


### Grouping rows based on PostalCode

In [185]:
#print(neighDF.shape, neighborhoods.shape)
neighborDF = neighDF.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda tags: ','.join(tags)).to_frame().reset_index()
neighborDF

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Adding Geospatial Data to Toronto dataframe

In [186]:
# Load the latitude/longitude CSV and rename its "Postal Code" column so that it
# matches the PostalCode column name used by the neighborhood table.
geographicData = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={"Postal Code": "PostalCode"})
)
geographicData.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging Toronto neighborhood and geospatial data into a single consolidated dataframe

In [187]:
# Attach coordinates to every postal code area.
# An inner join keeps only postal codes present in BOTH tables. The previous
# outer join could add rows with a NaN Borough/Neighborhood (code only in the
# coordinate file) or NaN coordinates (code only in the scraped table); the
# latter cannot be used as a map marker location further down.
geoTorontoDF = pd.merge(neighborDF, geographicData, on='PostalCode', how='inner')
geoTorontoDF.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Exploring neighborhoods of Toronto on folium map

In [188]:
from geopy.geocoders import Nominatim
import folium

In [189]:
address = 'Toronto, Canada'

# Look up the coordinates used to centre the maps below.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; without this
    # check the failure shows up as an AttributeError on the next line.
    raise ValueError('Could not geocode address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Totonto, Canada are 43.653963, -79.387207.


### Create a map of Toronto with neighborhoods superimposed on top.

In [190]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of geoTorontoDF; the popup shows
# "<neighborhood>, <borough>".
marker_rows = zip(
    geoTorontoDF['Latitude'],
    geoTorontoDF['Longitude'],
    geoTorontoDF['Borough'],
    geoTorontoDF['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### The map is sometimes not rendered on GitHub.
#### To view the map, open the notebook at:
https://eu-gb.dataplatform.cloud.ibm.com/analytics/notebooks/v2/e47a975d-f54b-4ab6-8a8c-96ff6e1bec68/view?access_token=2486d1a86c3b1e1c29520cde7ac52f43d0229fb120a807baec03adc023362e78

In [192]:
# Restrict the analysis to the boroughs listed here and renumber the rows from 0.
toronto_boroughs = ['East Toronto', 'Central Toronto', 'West Toronto']
toronto_central_df = geoTorontoDF[geoTorontoDF['Borough'].isin(toronto_boroughs)].reset_index(drop=True)
print(toronto_central_df.shape)

# All borough names present in the full table, for comparison with the list above.
# (A bare `toronto_central_df.head()` used to sit before this line; only the last
# expression of a cell is rendered, so its result was silently discarded.)
geoTorontoDF['Borough'].unique()

(20, 5)


array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [193]:
# Using Folium to create a Map of Toronto with Boroughs markers on top

# Map of the selected boroughs: one blue circle marker per postal code area,
# with a popup of the form "<borough> (<postal code>): <neighborhoods>".
map_toronto2 = folium.Map(location=[latitude, longitude], zoom_start=12)

selected_areas = zip(
    toronto_central_df['Latitude'],
    toronto_central_df['Longitude'],
    toronto_central_df['PostalCode'],
    toronto_central_df['Borough'],
    toronto_central_df['Neighborhood'],
)
for lat, long, post, borough, neigh in selected_areas:
    popup = folium.Popup("{} ({}): {}".format(borough, post, neigh), parse_html=True)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto2)

map_toronto2

In [194]:
import os

# Foursquare credentials are read from the environment so that no secret is ever
# stored in the notebook source or in its rendered output.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: any key pair that was previously committed in a notebook should be
# regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only WHETHER the credentials are available, never their values.
_missing_credentials = [
    env_name
    for env_name, value in (('FOURSQUARE_CLIENT_ID', CLIENT_ID),
                            ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET))
    if not value
]
if _missing_credentials:
    print('Missing Foursquare credentials; set environment variable(s): ' + ', '.join(_missing_credentials))
else:
    print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [195]:
radius = 500
LIMIT = 100

# One tuple per venue returned by the Foursquare "explore" endpoint:
# (postal code, borough, neighborhood, area lat, area lng,
#  venue name, venue lat, venue lng, venue category)
venues = []

for lat, long, post, borough, neighborhood in zip(toronto_central_df['Latitude'], toronto_central_df['Longitude'], toronto_central_df['PostalCode'], toronto_central_df['Borough'], toronto_central_df['Neighborhood']):
    # Let requests build and escape the query string instead of formatting it by hand.
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Fail with the HTTP error (bad credentials, quota, ...) rather than with a
    # KeyError while digging into an error payload.
    response.raise_for_status()

    groups = response.json()["response"].get('groups') or []
    results = groups[0]['items'] if groups else []

    for venue in results:
        categories = venue['venue']['categories']
        if not categories:
            # The category is the only venue feature used by the analysis, so a
            # venue without one is skipped instead of raising IndexError.
            continue
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['venue']['name'],
            venue['venue']['location']['lat'],
            venue['venue']['location']['lng'],
            categories[0]['name']))

In [196]:
# Turn the collected venue tuples into a table.
# Passing the names to the constructor (instead of assigning .columns afterwards)
# also works when `venues` is empty: the result is an empty frame with the right
# columns rather than a length-mismatch ValueError.
venue_columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)
print(venues_df.shape)
venues_df.head()

(413, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [198]:
#how many venues per each postal code
venues_df.groupby(['PostalCode', 'Borough', 'Neighborhood'])['VenueName'].count()
len(venues_df['VenueCategory'].unique())

137

In [199]:

# venues in each area

# one hot encoding
# One indicator column per venue category (one row per venue).
category_dummies = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Put the area identifiers back next to the indicators. The neighborhood column
# is stored as 'Neighborhoods' here, not 'Neighborhood'.
toronto_central_onehot = category_dummies.assign(
    PostalCode=venues_df['PostalCode'],
    Borough=venues_df['Borough'],
    Neighborhoods=venues_df['Neighborhood'],
)

# Reorder so that the three trailing columns come first.
all_columns = list(toronto_central_onehot.columns)
fixed_columns = all_columns[-3:] + all_columns[:-3]
toronto_central_onehot = toronto_central_onehot[fixed_columns]

print(toronto_central_onehot.shape)
toronto_central_onehot.head()

(413, 140)


Unnamed: 0,PostalCode,Borough,Neighborhoods,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Auto Workshop,BBQ Joint,...,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [200]:
# Frequency of occurance of a category in an area

# Frequency of occurrence of each venue category per area: the mean of the
# one-hot indicator columns within every (PostalCode, Borough, Neighborhoods) group.
area_groups = toronto_central_onehot.groupby(['PostalCode', 'Borough', 'Neighborhoods'])
toronto_central_venues_freq = area_groups.mean().reset_index()
print(toronto_central_venues_freq.shape)
toronto_central_venues_freq.head()

(20, 140)


Unnamed: 0,PostalCode,Borough,Neighborhoods,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Auto Workshop,BBQ Joint,...,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.02381,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [201]:
# Get 10 most occurance venue types in each area

# Number of most frequent venue categories to list for every area.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the column names "1st Most Common Venue", "2nd ...", ..., one per rank.
# The suffix is chosen explicitly instead of relying on a bare `except:` around
# an out-of-range list index, which would also hide unrelated errors and
# produced "21th"/"22th"/"23th" for larger values of num_top_venues.
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for rank in range(1, num_top_venues + 1):
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        suffix = 'th'
    else:
        suffix = indicators[rank % 10 - 1]
    freqColumns.append('{}{} Most Common Venue'.format(rank, suffix))
columns = areaColumns + freqColumns

# One output row per area, starting with the area identifiers.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
for area_column in areaColumns:
    neighborhoods_venues_sorted[area_column] = toronto_central_venues_freq[area_column]

# For every area, order the category frequencies from highest to lowest and keep
# the names of the first num_top_venues categories.
for ind in range(toronto_central_venues_freq.shape[0]):
    row_categories = toronto_central_venues_freq.iloc[ind, :].iloc[len(areaColumns):]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, len(areaColumns):] = row_categories_sorted.index.values[0:num_top_venues]

# Order the areas by their most common venue types.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.sort_values(freqColumns)

In [203]:
from sklearn.cluster import KMeans
kclusters = 4

# Feature matrix: the category frequencies without the area identifiers.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts.
toronto_central_venues_freq_clustering = toronto_central_venues_freq.drop(columns=['PostalCode', 'Borough', 'Neighborhoods'])

kmeans = KMeans(n_clusters=kclusters, random_state=0, init="k-means++", n_init=12).fit(toronto_central_venues_freq_clustering)
print(kmeans.labels_)

# The labels are in the row order of toronto_central_venues_freq, so they are
# attached through the PostalCode key rather than by position: positional
# assignment breaks (length mismatch or shifted labels) as soon as one area has
# no venues. Merging into a new frame also leaves toronto_central_df untouched;
# the previous alias assignment added the Cluster column to it as a side effect.
cluster_labels = pd.DataFrame({
    'PostalCode': toronto_central_venues_freq['PostalCode'].values,
    'Cluster': kmeans.labels_,
})
toronto_central_clustered_df = (
    toronto_central_df
    .drop(columns='Cluster', errors='ignore')  # left over from an earlier run of the old cell
    .merge(cluster_labels, on='PostalCode', how='inner')
)

# Add the ranked venue-type columns for every postal code.
top_venues_by_postcode = neighborhoods_venues_sorted.drop(columns=['Borough', 'Neighborhoods']).set_index('PostalCode')
toronto_central_clustered_df = toronto_central_clustered_df.join(top_venues_by_postcode, on='PostalCode')
toronto_central_clustered_df = toronto_central_clustered_df.sort_values(['Cluster'] + freqColumns)
toronto_central_clustered_df

[1 1 1 1 3 1 1 1 2 1 0 1 1 1 1 1 1 1 1 1]


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,M5N,Central Toronto,Roselawn,43.711695,-79.416936,0,Garden,Ice Cream Shop,Discount Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dog Run,Yoga Studio
14,M6J,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,1,Bar,Asian Restaurant,Coffee Shop,Men's Store,Pizza Place,Restaurant,New American Restaurant,Bakery,Cocktail Bar,Café
17,M6R,West Toronto,"Parkdale,Roncesvalles",43.64896,-79.456325,1,Breakfast Spot,Gift Shop,Cuban Restaurant,Bank,Eastern European Restaurant,Restaurant,Movie Theater,Bookstore,Dog Run,Bar
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Gastropub,Fish Market,Neighborhood,Music Store,Middle Eastern Restaurant
15,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",43.636847,-79.428191,1,Coffee Shop,Café,Breakfast Spot,Yoga Studio,Stadium,Gym,Furniture / Home Store,Intersection,Italian Restaurant,Performing Arts Venue
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,1,Coffee Shop,Pub,American Restaurant,Sushi Restaurant,Fried Chicken Joint,Light Rail Station,Liquor Store,Sports Bar,Supermarket,Pizza Place
12,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,1,Coffee Shop,Sandwich Place,Café,Pizza Place,Burger Joint,History Museum,Indian Restaurant,Jewish Restaurant,Liquor Store,Cosmetics Shop
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1,Coffee Shop,Yoga Studio,Sporting Goods Shop,Clothing Store,Dessert Shop,Chinese Restaurant,Diner,Rental Car Location,Salon / Barbershop,Mexican Restaurant
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Fruit & Vegetable Store,Pizza Place,Liquor Store,Juice Bar
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Health Food Store,Trail,Other Great Outdoors,Pub,Neighborhood,Coworking Space,Cuban Restaurant,Costume Shop,Cupcake Shop,Farmers Market


In [209]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# show clusters on the map

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, post, bor, poi, cluster in zip(toronto_central_clustered_df['Latitude'], toronto_central_clustered_df['Longitude'], toronto_central_clustered_df['PostalCode'], toronto_central_clustered_df['Borough'], toronto_central_clustered_df['Neighborhood'], toronto_central_clustered_df['Cluster']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # Border and fill use the SAME colour, indexed directly by the cluster label.
    # Previously the border used rainbow[cluster-1] and the fill rainbow[cluster-2],
    # so every marker mixed the colours of two different clusters.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters