# data

## import library

In [1]:
import pandas as pd
import numpy as np
import bs4
import requests

In [2]:
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
wiki.raise_for_status()
wiki_txt = bs4.BeautifulSoup(wiki.text, 'html.parser')
wiki_tr = wiki_txt.select('tbody > tr > td')

In [3]:
wiki_tr

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights

In [4]:
data = []
for tr in wiki_txt.tbody.find_all('tr'):
    data.append([td.get_text().strip() for td in tr.find_all('td')])

In [5]:
wiki = pd.DataFrame(data, columns=['Postal Code','Borough','Neighborhood'])
wiki = wiki.dropna()

In [6]:
wiki.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288 entries, 1 to 288
Data columns (total 3 columns):
Postal Code     288 non-null object
Borough         288 non-null object
Neighborhood    288 non-null object
dtypes: object(3)
memory usage: 9.0+ KB


In [7]:
wiki.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [8]:
wiki = wiki[wiki['Borough'] != 'Not assigned']

In [9]:
wiki.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211 entries, 3 to 287
Data columns (total 3 columns):
Postal Code     211 non-null object
Borough         211 non-null object
Neighborhood    211 non-null object
dtypes: object(3)
memory usage: 6.6+ KB


In [10]:
wiki.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [11]:
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))
wiki_temp = wiki.groupby(['Postal Code', 'Borough'])
wiki_group = wiki_temp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [12]:
wiki_group.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
wiki_group.loc[wiki_group['Neighborhood'] == 'Not assigned','Neighborhood'] = wiki_group.loc[wiki_group['Neighborhood'] == 'Not assigned','Borough']

In [14]:
wiki_group.shape

(103, 3)

import geocoder # import geocoder

In [15]:
geo_data = pd.read_csv('https://cocl.us/Geospatial_data')

In [16]:
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
toronto = pd.merge(wiki_group, geo_data, on='Postal Code')

In [18]:
toronto.shape

(103, 5)

In [19]:
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# clustering

## import library

In [20]:
import json 
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 
from tqdm import tqdm
from collections import deque
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## geograpical data

In [21]:
geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode('Toronto')
latitude = location.latitude
longitude = location.longitude

In [22]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=9)
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'],
                                           toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

## Foursquare API

In [51]:
# Deleted personal data

In [24]:
def getNearbyVenues(names, latitudes, longitudes):
    venues=[]
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total = names.size):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, limit)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], 
                        v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venues in venues for item in venues])
    nearby_venues.columns = ['Neighborhood','Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
                             'Venue Latitude', 'Venue Longitude','Venue Category']
    return(nearby_venues)

In [25]:
radius=500
limit = 100
toronto_venues = getNearbyVenues(toronto.Neighborhood,
                            toronto.Latitude,
                            toronto.Longitude)

100%|██████████| 103/103 [00:41<00:00,  3.18it/s]


In [26]:
toronto_venues.shape

(2234, 7)

In [27]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


### onehotcoding

In [28]:
toronto_oh = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_oh['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_oh.columns[-1]] + list(toronto_oh.columns[:-1])
toronto_oh = toronto_oh[fixed_columns]

In [29]:
toronto_oh.shape

(2234, 272)

In [30]:
toronto_oh.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### grouping

In [31]:
toronto_gp = toronto_oh.groupby('Neighborhood').mean().reset_index()

In [32]:
toronto_gp.shape

(100, 272)

In [33]:
toronto_gp.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### venue sort

In [34]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False) 
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
num_top_venues = 5
columns = ['Neighborhood']
indicators = ['st', 'nd', 'rd']

In [36]:
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

In [37]:
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_gp['Neighborhood']

In [38]:
for ind in np.arange(toronto_gp.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_gp.iloc[ind, :], num_top_venues)

In [39]:
neighborhoods_venues_sorted.shape

(100, 6)

In [40]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,Steakhouse,Bar
1,Agincourt,Lounge,Breakfast Spot,Skating Rink,Clothing Store,Drugstore
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Women's Store,Drugstore,Diner
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Coffee Shop,Fast Food Restaurant,Beer Store,Sandwich Place
4,"Alderwood, Long Branch",Pizza Place,Gym,Pool,Skating Rink,Coffee Shop


### kmean

In [41]:
kclusters = 5
toronto_gp_ct = toronto_gp.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_gp_ct)
kmeans.labels_[0:10] 

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [42]:
toronto_gp["Cluster Labels"] = kmeans.labels_
toronto_done = toronto.merge(toronto_gp, left_on = "Neighborhood", right_on = "Neighborhood", how = "outer")
toronto_done = toronto_done.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_done["Cluster Labels"] = toronto_done["Cluster Labels"].fillna(5).astype("int")

In [43]:
toronto_done.shape

(103, 282)

In [44]:
toronto_done.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,...,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3,Fast Food Restaurant,Drugstore,Diner,Discount Store,Dive Bar
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,History Museum,Bar,Women's Store,Dumpling Restaurant,Dive Bar
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,Breakfast Spot,Rental Car Location,Pizza Place,Electronics Store,Mexican Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,Coffee Shop,Indian Restaurant,Korean Restaurant,Dumpling Restaurant,Discount Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,Fried Chicken Joint,Bakery,Hakka Restaurant,Caribbean Restaurant,Athletics & Sports


### create map

In [45]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_done['Latitude'],
                                  toronto_done['Longitude'],
                                  toronto_done['Neighborhood'],
                                  toronto_done['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# cluster

In [46]:
toronto_done.loc[toronto_done['Cluster Labels'] == 0,
                   ['Borough','1st Most Common Venue','2nd Most Common Venue',
                    '3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue']]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Scarborough,History Museum,Bar,Women's Store,Dumpling Restaurant,Dive Bar
2,Scarborough,Breakfast Spot,Rental Car Location,Pizza Place,Electronics Store,Mexican Restaurant
3,Scarborough,Coffee Shop,Indian Restaurant,Korean Restaurant,Dumpling Restaurant,Discount Store
4,Scarborough,Fried Chicken Joint,Bakery,Hakka Restaurant,Caribbean Restaurant,Athletics & Sports
6,Scarborough,Discount Store,Department Store,Chinese Restaurant,Bus Station,Coffee Shop
7,Scarborough,Bus Line,Bakery,Soccer Field,Intersection,Bus Station
8,Scarborough,American Restaurant,Motel,Dim Sum Restaurant,Discount Store,Dive Bar
9,Scarborough,College Stadium,Skating Rink,General Entertainment,Café,Empanada Restaurant
10,Scarborough,Indian Restaurant,Light Rail Station,Pet Store,Latin American Restaurant,Vietnamese Restaurant
11,Scarborough,Middle Eastern Restaurant,Shopping Mall,Sandwich Place,Auto Garage,Breakfast Spot


In [47]:
toronto_done.loc[toronto_done['Cluster Labels'] == 1,
                   ['Borough','1st Most Common Venue','2nd Most Common Venue',
                    '3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue']]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
5,Scarborough,Playground,Women's Store,Dumpling Restaurant,Discount Store,Dive Bar


In [48]:
toronto_done.loc[toronto_done['Cluster Labels'] == 2,
                   ['Borough','1st Most Common Venue','2nd Most Common Venue',
                    '3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue']]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
14,Scarborough,Park,Playground,Women's Store,Drugstore,Diner
23,North York,Park,Convenience Store,Bank,Women's Store,Dumpling Restaurant
25,North York,Fast Food Restaurant,Park,Bus Stop,Food & Drink Shop,Drugstore
30,North York,Park,Airport,Playground,Women's Store,Drugstore
40,East York,Park,Pizza Place,Convenience Store,Women's Store,Drugstore
44,Central Toronto,Park,Swim School,Bus Line,Women's Store,Donut Shop
50,Downtown Toronto,Park,Playground,Building,Trail,Electronics Store
72,North York,Park,Pub,Asian Restaurant,Japanese Restaurant,Empanada Restaurant
73,York,Field,Hockey Arena,Trail,Park,Event Space
74,York,Park,Women's Store,Market,Fast Food Restaurant,Comic Shop


In [49]:
toronto_done.loc[toronto_done['Cluster Labels'] == 3,
                   ['Borough','1st Most Common Venue','2nd Most Common Venue',
                    '3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue']]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Scarborough,Fast Food Restaurant,Drugstore,Diner,Discount Store,Dive Bar
80,York,Fast Food Restaurant,Discount Store,Sandwich Place,Drugstore,Dive Bar


In [50]:
toronto_done.loc[toronto_done['Cluster Labels'] == 4,
                   ['Borough','1st Most Common Venue','2nd Most Common Venue',
                    '3rd Most Common Venue','4th Most Common Venue','5th Most Common Venue']]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
63,Central Toronto,Garden,Women's Store,Drugstore,Discount Store,Dive Bar
