# This notebook will be used for the capstone project

In [20]:
import pandas as pd
import numpy as np
import bs4 as bs
import requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


# Confirmation messages showing that the import cell ran to completion.
print('Libraries imported.')
print ("Hello Capstone Project Course!")

Libraries imported.
Hello Capstone Project Course!


In [21]:
# Wikipedia page listing the Toronto ("M") postal codes with borough and neighbourhood.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per table parsed from the page; header=0 uses the
# first row of each table as its column names.
dfs = pd.read_html(url, header=0)
# The postal-code table is the first one returned.
toronto_df = dfs[0]

toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Dropping the rows where Borough is Not assigned


In [22]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']


In [23]:
# Preview the filtered frame (the original row labels are kept by the boolean filter).
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Functions for processing the data


In [24]:
def concat_neighborhoods(series):
    """Concatenate the values of ``series`` into one string, each followed by ', '.

    The separator is appended after every value, so a non-empty result always
    ends with a trailing ', '; an empty input yields ''.
    """
    return ''.join(value + ', ' for value in series)

def ret_unique(series):
    """Return, for each string in ``series``, the text that precedes its first comma.

    Strings without a comma are returned unchanged. The result is a list with
    one entry per input value, in input order.
    """
    return [text.split(',', 1)[0] for text in series]

def set_neighborhood(borough, neighborhood):
    """Return neighbourhood names, falling back to the borough when none is given.

    Args:
        borough: iterable of borough names.
        neighborhood: iterable of neighbourhood names, parallel to ``borough``.

    Returns:
        A list with exactly one entry per (borough, neighbourhood) pair: the
        borough when the neighbourhood is 'Not assigned' or empty, otherwise
        the neighbourhood itself.
    """
    # An empty neighbourhood used to be skipped, which made the result shorter
    # than the inputs and broke assigning it back as a DataFrame column.
    return [
        boro if (not item or item == 'Not assigned') else item
        for boro, item in zip(borough, neighborhood)
    ]

In [25]:
# Collapse to one row per postal code: each remaining column of a group is reduced with
# concat_neighborhoods, which joins the values with ', ' and leaves a trailing ', '.
toronto_aggregated = toronto_df.groupby(toronto_df['Postcode']).aggregate(concat_neighborhoods)
# Turn the 'Postcode' group key back into a regular column.
toronto_aggregated = toronto_aggregated.reset_index()
# Strip the trailing ', ' left by concat_neighborhoods.
toronto_aggregated['Neighbourhood'] = toronto_aggregated['Neighbourhood'].astype(str).str[:-2]
# Borough was concatenated as well; keep only the text before its first comma.
toronto_aggregated['Borough'] = ret_unique(toronto_aggregated['Borough'])
toronto_aggregated.columns = ['PostCode', 'Borough', 'Neighborhood']
# A neighbourhood recorded as 'Not assigned' is replaced by its borough name.
toronto_aggregated['Neighborhood'] = set_neighborhood(toronto_aggregated['Borough'], toronto_aggregated['Neighborhood'])
toronto_aggregated.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [26]:
# Report (rows, columns) of the aggregated frame: one row per postal code.
print("The shape of Cleansed Toronto Dataframe is : ", toronto_aggregated.shape)


The shape of Cleansed Toronto Dataframe is :  (103, 3)


## Adding coordinates of each individual Neighborhood using the geocoder package


In [27]:
from geopy.geocoders import Nominatim 
 

In [28]:
# Load the postal-code -> latitude/longitude lookup table from the hosted CSV.
geodata = pd.read_csv('http://cocl.us/Geospatial_data')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [29]:
# Attach Latitude/Longitude by matching 'PostCode' against geodata's 'Postal Code' index
# (DataFrame.join defaults to a left join, so every aggregated row is kept).
toronto_merged = toronto_aggregated.join(geodata.set_index('Postal Code'), on='PostCode')
toronto_merged.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [31]:
address = 'Toronto, Canada'

# Resolve the address to coordinates through the Nominatim geocoding service.
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
# NOTE(review): geocode() returns None when nothing matches, which would make the
# attribute reads below fail -- confirm whether a guard is wanted here.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


# Visualize the Neighborhoods


In [1]:
# Install folium from the notebook, then import it for the map cell.
# NOTE(review): `%pip install folium==<version>` targets the kernel's own environment and
# pins the version -- confirm the IPython in use supports the %pip magic before switching.
!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/4f/86/1ab30184cb60bc2b95deffe2bd86b8ddbab65a4fac9f7313c278c6e8d049/folium-0.9.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 12.4MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.9.1


In [32]:

from geopy.geocoders import Nominatim

# Nominatim needs an identifying user_agent: geopy 1.x only warns when it is missing,
# geopy 2.x raises ConfigurationError. The agent string matches the one used for the
# 'Toronto, Canada' lookup earlier in the notebook.
# NOTE: the name `geocoder` is also the name of the `geocoder` package imported elsewhere
# in this notebook; it is kept here so later cells see the same variable.
geocoder = Nominatim(user_agent="foursquare_agent")
g = geocoder.geocode('Toronto, Ontario')

# geocode() returns None when the address cannot be resolved; fail with a clear message
# instead of an AttributeError on the lines below.
if g is None:
    raise ValueError("Nominatim could not geocode 'Toronto, Ontario'")

t_latitude = g.latitude
t_longitude = g.longitude

print("Latitude and Logitude of Toronto, Ontario are: {}, {}".format(t_latitude, t_longitude))





Latitude and Logitude of Toronto, Ontario are: 43.653963, -79.387207


In [4]:
# Install the `geocoder` package; the only code that uses it is the disabled lookup
# kept as a string literal further down.
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 4.8MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [17]:
import geocoder

In [18]:
# Disabled alternative: look up each postal code with geocoder.google, retrying until
# coordinates come back. The code is wrapped in a string literal, so nothing here runs;
# the cell only echoes the string.
# NOTE(review): if re-enabled, the while loop has no retry limit.
"""
latitude = []
longitude = []
for code in toronto_aggregated['PostCode']:
    lat_long_coords = None
    
    while lat_long_coords is None:
        g = geocoder.google('{}, Toronto, Ontario'.format(code))
        lat_long_coords = g.latlng
        
    latitude.append(lat_long_coords[0])
    longitude.append(lat_long_coords[1])
    
toronto_aggregated['Latitude'] = latitude
toronto_aggregated['Longitude'] = longitude
toronto_aggregated
"""

"\nlatitude = []\nlongitude = []\nfor code in toronto_aggregated['PostCode']:\n    lat_long_coords = None\n    \n    while lat_long_coords is None:\n        g = geocoder.google('{}, Toronto, Ontario'.format(code))\n        lat_long_coords = g.latlng\n        \n    latitude.append(lat_long_coords[0])\n    longitude.append(lat_long_coords[1])\n    \ntoronto_aggregated['Latitude'] = latitude\ntoronto_aggregated['Longitude'] = longitude\ntoronto_aggregated\n"

In [34]:
# Same load as the earlier geodata cell: postal-code -> latitude/longitude lookup table.
geodata = pd.read_csv('http://cocl.us/Geospatial_data')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
# Same merge as the earlier cell: match 'PostCode' against geodata's 'Postal Code' index.
toronto_merged = toronto_aggregated.join(geodata.set_index('Postal Code'), on='PostCode')
toronto_merged.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Get Latitude and Longitude of Toronto



In [36]:
from geopy.geocoders import Nominatim

# Nominatim needs an identifying user_agent: geopy 1.x only warns when it is missing,
# geopy 2.x raises ConfigurationError. The agent string matches the one used for the
# 'Toronto, Canada' lookup earlier in the notebook.
geocoder = Nominatim(user_agent="foursquare_agent")
g = geocoder.geocode('Toronto, Ontario')

# geocode() returns None when the address cannot be resolved; fail with a clear message
# instead of an AttributeError on the lines below.
if g is None:
    raise ValueError("Nominatim could not geocode 'Toronto, Ontario'")

t_latitude = g.latitude
t_longitude = g.longitude

print("Latitude and Logitude of Toronto, Ontario are: {}, {}".format(t_latitude, t_longitude))

  app.launch_new_instance()


Latitude and Logitude of Toronto, Ontario are: 43.653963, -79.387207


In [37]:
# Base map centred on the geocoded Toronto coordinates.
toronto_map = folium.Map(location=[t_latitude, t_longitude], zoom_start=10)

# One circle marker per row of toronto_merged, with the neighbourhood names as its popup.
for lat, lng, neighborhood in zip(
        toronto_merged['Latitude'],
        toronto_merged['Longitude'],
        toronto_merged['Neighborhood']):
    label = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False,
    )
    marker.add_to(toronto_map)

toronto_map

Define Foursquare API Credentials


In [38]:
import os
import warnings

# Foursquare credentials are read from the environment so they are never stored in, or
# printed by, the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been exposed and
# should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605'

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected.'
    )

# Confirm what was loaded without echoing the secret itself.
print('Credentials: \nClient ID: {}\nClient Secret: {}\nVersion: {}'.format(CLIENT_ID, '*' * len(CLIENT_SECRET), VERSION))

Credentials: 
Client ID: MJC504CIE3JRA0HHZA2YASNHA43D5T5U4I2XA12I1FQLUPGT
Client Secret: <redacted>
Version: 20180605


In [39]:
# Use the row labelled 0 of toronto_merged as a single-neighbourhood example.
neighborhood_name = toronto_merged.loc[0, 'Neighborhood']
neigh_lat = toronto_merged.loc[0, 'Latitude']
neigh_lon = toronto_merged.loc[0, 'Longitude']

print('Latitude and Longitude of {} are {}, {}'.format(neighborhood_name, neigh_lat, neigh_lon))

Latitude and Longitude of Rouge, Malvern are 43.806686299999996, -79.19435340000001


In [40]:
radius = 500
LIMIT = 100

# Foursquare "explore" request for venues around the sample neighbourhood coordinates.
uri_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
uri = uri_template.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        neigh_lat, neigh_lon,
        radius,
        LIMIT)

# Show the request with the client secret masked, so the secret is not saved in the
# cell output. `uri` itself still carries the real value for the actual request.
print(uri_template.format(
        CLIENT_ID,
        '<hidden>',
        VERSION,
        neigh_lat, neigh_lon,
        radius,
        LIMIT))

https://api.foursquare.com/v2/venues/explore?&client_id=MJC504CIE3JRA0HHZA2YASNHA43D5T5U4I2XA12I1FQLUPGT&client_secret=<redacted>&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100


In [41]:
# Call the explore endpoint and decode the JSON body into Python objects.
# NOTE(review): the request has no timeout and the HTTP status is not checked.
results = requests.get(uri).json()
results

{'meta': {'code': 200, 'requestId': '5d30a6f2dbde110025d8590e'},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.8111863045, 'lng': -79.18812958073042},
   'sw': {'lat': 43.80218629549999, 'lng': -79.2005772192696}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bb6b9446edc76b0d771311c',
       'name': "Wendy's",
       'location': {'crossStreet': 'Morningside & Sheppard',
        'lat': 43.80744841934756,
        'lng': -79.19905558052072,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'lng': -79.19905558052072}],
        'distance': 387,
        'cc': 'CA',
        'city': 'Toronto',
    

In [42]:
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Args:
        row: mapping-like record (dict or DataFrame row) that holds the list of
            category dicts under 'categories' or, failing that,
            'venue.categories'.

    Returns:
        The 'name' of the first category, or None when the list is empty.

    Raises:
        KeyError: if neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being hidden by a bare except.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [43]:
from pandas.io.json import json_normalize
# Venue records of the first group in the explore response.
venues = results['response']['groups'][0]['items']
    
# Nested keys become dotted column names such as 'venue.location.lat'.
nearby_venues = json_normalize(venues) # flatten JSON

# Keep only the name, category list and coordinates of each venue.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Replace each row's list of category dicts by the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of every column name (e.g. 'venue.location.lat' -> 'lat').
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056
1,Interprovincial Group,Print Shop,43.80563,-79.200378


In [44]:
def get_nearby_venues(names, latitude, longitude, radius=500, LIMIT=100):
    """Query the Foursquare explore endpoint around every neighbourhood.

    Args:
        names: iterable of neighbourhood names.
        latitude: iterable of latitudes, parallel to ``names``.
        longitude: iterable of longitudes, parallel to ``names``.
        radius: search radius passed to the API as ``radius``.
        LIMIT: maximum number of venues requested per neighbourhood.

    Returns:
        A DataFrame with one row per venue and the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue was found or the inputs
        are empty. 'Venue Category' is None for a venue without categories.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if a request takes longer than 30 seconds.
    """
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
               'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

    rows = []
    for name, lat, long in zip(names, latitude, longitude):
        print('Processing Neighborhood: ', name)
        url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat, long,
            radius,
            LIMIT)

        # Bound the wait and surface HTTP errors (bad credentials, quota) directly
        # instead of as a KeyError on the missing 'groups' key.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                long,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue can come back without any category.
                categories[0]['name'] if categories else None))

        print('Done: ', name)

    # Build the frame once, after the loop, rather than on every iteration.
    return pd.DataFrame(rows, columns=columns)

In [45]:
# Fetch venues for every row of toronto_merged, using a 700 m radius instead of the
# function's default of 500.
toronto_venues = get_nearby_venues(names = toronto_merged['Neighborhood'], 
                                   latitude = toronto_merged['Latitude'], 
                                   longitude = toronto_merged['Longitude'],
                                  radius=700)

Processing Neighborhood:  Rouge, Malvern
Done:  Rouge, Malvern
Processing Neighborhood:  Highland Creek, Rouge Hill, Port Union
Done:  Highland Creek, Rouge Hill, Port Union
Processing Neighborhood:  Guildwood, Morningside, West Hill
Done:  Guildwood, Morningside, West Hill
Processing Neighborhood:  Woburn
Done:  Woburn
Processing Neighborhood:  Cedarbrae
Done:  Cedarbrae
Processing Neighborhood:  Scarborough Village
Done:  Scarborough Village
Processing Neighborhood:  East Birchmount Park, Ionview, Kennedy Park
Done:  East Birchmount Park, Ionview, Kennedy Park
Processing Neighborhood:  Clairlea, Golden Mile, Oakridge
Done:  Clairlea, Golden Mile, Oakridge
Processing Neighborhood:  Cliffcrest, Cliffside, Scarborough Village West
Done:  Cliffcrest, Cliffside, Scarborough Village West
Processing Neighborhood:  Birch Cliff, Cliffside West
Done:  Birch Cliff, Cliffside West
Processing Neighborhood:  Dorset Park, Scarborough Town Centre, Wexford Heights
Done:  Dorset Park, Scarborough Town

In [46]:
# Preview the collected venues (one row per venue).
toronto_venues.head()


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
3,"Rouge, Malvern",43.806686,-79.194353,Tim Hortons,43.802,-79.198169,Coffee Shop
4,"Rouge, Malvern",43.806686,-79.194353,Lee Valley,43.803161,-79.199681,Hobby Shop


In [47]:
# Number of non-null values per column for each neighbourhood, i.e. how many venues came back.
toronto_venues.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,8,8,8,8,8,8
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",15,15,15,15,15,15
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",10,10,10,10,10,10
"Alderwood, Long Branch",13,13,13,13,13,13
"Bathurst Manor, Downsview North, Wilson Heights",21,21,21,21,21,21
Bayview Village,8,8,8,8,8,8
"Bedford Park, Lawrence Manor East",30,30,30,30,30,30
Berczy Park,100,100,100,100,100,100
"Birch Cliff, Cliffside West",9,9,9,9,9,9


In [48]:
# One indicator column per venue category, one row per venue.
toronto_one_hot = pd.get_dummies(toronto_venues['Venue Category'])
# A category that is itself called 'Neighborhood' would collide with the key column added
# on the next line, so that indicator column is removed. errors='ignore' keeps the cell
# working when the fetched venues contain no such category (a plain drop raises KeyError).
toronto_one_hot = toronto_one_hot.drop(columns='Neighborhood', errors='ignore')
toronto_one_hot['Neighborhood'] = toronto_venues['Neighborhood']
# Move the 'Neighborhood' key column to the front.
category_columns = [toronto_one_hot.columns[-1]] + list(toronto_one_hot.columns[:-1])
toronto_one_hot = toronto_one_hot[category_columns]

# Mean of the indicators per neighbourhood = share of its venues in each category.
toronto_venues_grouped = toronto_one_hot.groupby('Neighborhood').mean().reset_index()
toronto_venues_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
def get_most_common(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first element.

    The first element of ``row`` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [50]:
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Column headers: 'Neighborhood', then '1st Most Common Venue', '2nd ...', '3rd ...', '4th ...'.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of catching every exception with a bare except.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_venues_grouped['Neighborhood']

# Fill each row with that neighbourhood's top categories, most frequent first.
for ind in np.arange(toronto_venues_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = get_most_common(toronto_venues_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Theater,Restaurant,Cosmetics Shop,Sushi Restaurant,Asian Restaurant,Hotel
1,Agincourt,Badminton Court,Clothing Store,Lounge,Pool Hall,Shanghai Restaurant,Breakfast Spot,Motorcycle Shop,Sandwich Place,Yoga Studio,Dog Run
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Pizza Place,BBQ Joint,Fast Food Restaurant,Chinese Restaurant,Pharmacy,Gym,Malay Restaurant,Park,Shop & Service,Noodle House
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Hardware Store,Pizza Place,Fast Food Restaurant,Beer Store,Fried Chicken Joint,Sandwich Place,Coffee Shop,Pharmacy,Comfort Food Restaurant
4,"Alderwood, Long Branch",Pizza Place,Convenience Store,Pharmacy,Pool,Athletics & Sports,Gas Station,Skating Rink,Sandwich Place,Pub,Gym
5,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Park,Community Center,Sandwich Place,Sushi Restaurant,Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Supermarket,Diner
6,Bayview Village,Bank,Skate Park,Café,Grocery Store,Skating Rink,Japanese Restaurant,Chinese Restaurant,Donut Shop,Diner,Discount Store
7,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Pizza Place,Liquor Store,Thai Restaurant,Bagel Shop,Bakery,Sushi Restaurant,Juice Bar,Fast Food Restaurant
8,Berczy Park,Coffee Shop,Café,Restaurant,Hotel,Beer Bar,Park,Bakery,Cocktail Bar,Seafood Restaurant,Japanese Restaurant
9,"Birch Cliff, Cliffside West",College Stadium,Café,Diner,Discount Store,Park,Bank,General Entertainment,Skating Rink,Thai Restaurant,Dog Run


# Clustering of Neighborhoods


In [51]:
from sklearn.cluster import KMeans


In [None]:
# Number of clusters to fit.
kclusters = 7

# Feature matrix: the per-neighbourhood category frequencies without the name column.
toronto_clustering = toronto_venues_grouped.drop('Neighborhood', axis=1)

# random_state=0 fixes the random seed so the fit is repeatable.
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(toronto_clustering)

# Cluster labels assigned to the first ten rows.
kmeans.labels_[0:10]

In [None]:
# kmeans was fitted on toronto_venues_grouped, whose rows are in the same order as
# neighborhoods_venues_sorted, so the labels line up row by row.
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_

# merge final data with most common venues
toronto_final = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Rows with no match in neighborhoods_venues_sorted have no label after the join. They get
# the extra label `kclusters` (one past the largest k-means label) so the column can be
# cast to int. Assigning the result back avoids fillna(inplace=True) on a column selection,
# which is chained assignment and is not guaranteed to modify toronto_final.
toronto_final['Cluster Labels'] = toronto_final['Cluster Labels'].fillna(kclusters).astype(int)
toronto_final.head()