# Applied Data Science Capstone

In [36]:
#Required Libraries

import numpy as np
import pandas as pd
import json
import requests 
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

## Loading data from previous notebook

In [16]:
data = pd.read_csv('data.csv')
data.drop(['Unnamed: 0'],axis=1, inplace=True)

In [17]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [18]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        ).add_to(map_toronto)  
    
map_toronto

In [19]:
# filter borough names that contain the word Toronto
borough_names = list(data.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

['Downtown Toronto',
 'East Toronto',
 'West Toronto',
 'East YorkEast Toronto',
 'Central Toronto']

In [20]:
# create a new DataFrame with only boroughs that contain the word Toronto
data = data[data['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Address,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront","M5A, Downtown Toronto",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson","M5B, Downtown Toronto",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,"M5C, Downtown Toronto",43.651494,-79.375418
3,M4E,East Toronto,The Beaches,"M4E, East Toronto",43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,"M5E, Downtown Toronto",43.644771,-79.373306


In [27]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

## Using FourSquare API

In [32]:
CLIENT_ID = '' 
CLIENT_SECRET = ''
VERSION = '20180605'

In [39]:
radius = 500
LIMIT = 100

venues = []
for lat, long, post, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['PostalCode'], data['Borough'], data['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url,timeout=30).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [40]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

venues_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,5,5,5,5,5,5
M4J,East YorkEast Toronto,The Danforth East,4,4,4,4,4,4
M4K,East Toronto,"The Danforth West , Riverdale",41,41,41,41,41,41
M4L,East Toronto,"India Bazaar , The Beaches West",20,20,20,20,20,20
M4M,East Toronto,Studio District,41,41,41,41,41,41
M4N,Central Toronto,Lawrence Park,3,3,3,3,3,3
M4P,Central Toronto,Davisville North,8,8,8,8,8,8
M4R,Central Toronto,North Toronto West,19,19,19,19,19,19
M4S,Central Toronto,Davisville,34,34,34,34,34,34
M4T,Central Toronto,"Moore Park , Summerhill East",3,3,3,3,3,3


In [42]:
venues_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


## The number of unique categories from all returned values

In [43]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))
venues_df['VenueCategory'].unique()[:50]

There are 238 uniques categories.


array(['Bakery', 'Coffee Shop', 'Distribution Center', 'Spa',
       'Breakfast Spot', 'Restaurant', 'Park', 'Historic Site', 'Pub',
       'Farmers Market', 'Chocolate Shop', 'Dessert Shop', 'Theater',
       'Performing Arts Venue', 'French Restaurant', 'Café',
       'Mexican Restaurant', 'Event Space', 'Ice Cream Shop',
       'Asian Restaurant', 'Shoe Store', 'Art Gallery', 'Cosmetics Shop',
       'Electronics Store', 'Bank', 'Beer Store', 'Hotel',
       'Health Food Store', 'Antique Shop', 'Clothing Store',
       'Comic Shop', 'Pizza Place', 'Plaza', 'Tea Room', 'Burrito Place',
       'Music Venue', 'Ramen Restaurant', 'Burger Joint',
       'Thai Restaurant', 'Diner', 'Movie Theater', 'Sandwich Place',
       'Steakhouse', 'Sporting Goods Shop', 'Shopping Mall',
       'American Restaurant', 'Japanese Restaurant', 'College Rec Center',
       'Gastropub', 'Bookstore'], dtype=object)

In [44]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Borough'] = venues_df['Borough'] 
toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

(1695, 241)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,Downtown Toronto,"Regent Park , Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,Downtown Toronto,"Regent Park , Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,Downtown Toronto,"Regent Park , Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
toronto_grouped = toronto_onehot.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()
toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4J,East YorkEast Toronto,The Danforth East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4K,East Toronto,"The Danforth West , Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439
3,M4L,East Toronto,"India Bazaar , The Beaches West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,0.02439


## Displaying top 10 venues for each postal code

In [49]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_grouped['Neighborhoods']

for ind in np.arange(toronto_grouped.shape[0]):
    row_categories = toronto_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

neighborhoods_venues_sorted

Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Park,Health Food Store,Pub,Neighborhood,Trail,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dim Sum Restaurant,Donut Shop
1,M4J,East YorkEast Toronto,The Danforth East,Park,Convenience Store,Coffee Shop,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store
2,M4K,East Toronto,"The Danforth West , Riverdale",Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Bookstore,Restaurant,Brewery,Café,Cosmetics Shop
3,M4L,East Toronto,"India Bazaar , The Beaches West",Sandwich Place,Park,Food & Drink Shop,Brewery,Burrito Place,Restaurant,Fast Food Restaurant,Fish & Chips Shop,Italian Restaurant,Intersection
4,M4M,East Toronto,Studio District,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Italian Restaurant,Yoga Studio,Fish Market,Pet Store,Park
5,M4N,Central Toronto,Lawrence Park,Park,Bus Line,Swim School,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store
6,M4P,Central Toronto,Davisville North,Gym,Hotel,Convenience Store,Department Store,Sandwich Place,Breakfast Spot,Food & Drink Shop,Park,General Entertainment,Event Space
7,M4R,Central Toronto,North Toronto West,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Café,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant,Health & Beauty Service
8,M4S,Central Toronto,Davisville,Dessert Shop,Sandwich Place,Coffee Shop,Italian Restaurant,Pizza Place,Gym,Sushi Restaurant,Café,Park,Brewery
9,M4T,Central Toronto,"Moore Park , Summerhill East",Restaurant,Park,Playground,Yoga Studio,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


## k-means clustering

In [50]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 4, 4, 4, 1, 4, 4, 4, 3])

In [53]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
toronto_merged = data.copy()

# add clustering labels
toronto_merged["Cluster Labels"] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Address,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park , Harbourfront","M5A, Downtown Toronto",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Mexican Restaurant,Theater,Café,Breakfast Spot,Restaurant,Antique Shop
1,M5B,Downtown Toronto,"Garden District, Ryerson","M5B, Downtown Toronto",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Japanese Restaurant,Cosmetics Shop,Café,Middle Eastern Restaurant,Italian Restaurant,Electronics Store,Diner,Pizza Place
2,M5C,Downtown Toronto,St. James Town,"M5C, Downtown Toronto",43.651494,-79.375418,4,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Breakfast Spot,Beer Bar,Cosmetics Shop,Bakery,Diner
3,M4E,East Toronto,The Beaches,"M4E, East Toronto",43.676357,-79.293031,4,Park,Health Food Store,Pub,Neighborhood,Trail,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dim Sum Restaurant,Donut Shop
4,M5E,Downtown Toronto,Berczy Park,"M5E, Downtown Toronto",43.644771,-79.373306,4,Coffee Shop,Cocktail Bar,Seafood Restaurant,Café,Cheese Shop,Bakery,Beer Bar,Restaurant,Farmers Market,Beach


In [55]:
# sort the results by Cluster Labels
toronto_merged.sort_values(["Cluster Labels"], inplace=True)
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Address,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park , Harbourfront","M5A, Downtown Toronto",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Mexican Restaurant,Theater,Café,Breakfast Spot,Restaurant,Antique Shop
24,M5R,Central Toronto,"The Annex , North Midtown , Yorkville","M5R, Central Toronto",43.67271,-79.405678,0,Café,Sandwich Place,Coffee Shop,History Museum,Pizza Place,Indian Restaurant,Pub,Liquor Store,BBQ Joint,Donut Shop
1,M5B,Downtown Toronto,"Garden District, Ryerson","M5B, Downtown Toronto",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Japanese Restaurant,Cosmetics Shop,Café,Middle Eastern Restaurant,Italian Restaurant,Electronics Store,Diner,Pizza Place
5,M5G,Downtown Toronto,Central Bay Street,"M5G, Downtown Toronto",43.657952,-79.387383,1,Coffee Shop,Italian Restaurant,Sandwich Place,Japanese Restaurant,Burger Joint,Ice Cream Shop,Thai Restaurant,Gym / Fitness Center,Department Store,Middle Eastern Restaurant
23,M4R,Central Toronto,North Toronto West,"M4R, Central Toronto",43.715383,-79.405678,2,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Café,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant,Health & Beauty Service


## Visualising the clusters

In [58]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters