In [8]:
! pip install "ipython-beautifulsoup[bs4]"

Defaulting to user installation because normal site-packages is not writeable


In [9]:
#Importing Libraries

from bs4 import BeautifulSoup
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # library to handle requests
import json 
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import urllib.request
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import sys
! pip install --user folium
import folium # map rendering library



## Scraping Wikipedia

In [10]:
#Scraping wikipedia

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = urllib.request.urlopen(url)
article = req.read().decode()
article2 = str(article.encode(sys.stdout.encoding, errors='replace'))
#print(article2)
with open('Postal_Codes-M.html', 'w') as fo:
    fo.write(article2)
    
article = open('Postal_Codes-M.html').read()
soup = BeautifulSoup(article, 'html.parser')
table = soup.find('table', class_='sortable')
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    if(('Not assigned' in str(tr))==False):
        td = tr.find_all('td')
        row=[tr.text for tr in td]
        l.append(row)
df = pd.DataFrame(l)
df.drop(index = 0, inplace = True)
df.columns = ['Postal Code','Borough','Neighbourhood' ]
for column in df:
    df[column] = df[column].str.replace('\\\\n','')
df.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"
5,M7A,Downtown Toronto,"Queen\'s Park, Ontario Provincial Government"


In [11]:
url = 'http://cocl.us/Geospatial_data'
coords = requests.get(url, allow_redirects=True)
open('c_table.csv', 'wb').write(coords.content)    
coord_table = pd.read_csv('c_table.csv')

In [12]:
coord_table.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
df_coord = pd.merge(df, coord_table, on='Postal Code', how='right')
df_coord.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen\'s Park, Ontario Provincial Government",43.662301,-79.389494


In [14]:
df_coord["Borough"] = df_coord["Borough"].astype('category')
df_coord["Borough_num"] = df_coord["Borough"].cat.codes
df_coord.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Borough_num
0,M3A,North York,Parkwoods,43.753259,-79.329656,6
1,M4A,North York,Victoria Village,43.725882,-79.315572,6
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,6
4,M7A,Downtown Toronto,"Queen\'s Park, Ontario Provincial Government",43.662301,-79.389494,1


In [15]:
long = 43.7043
lat = -79.3883
# create map of Manhattan using latitude and longitude values
map_gta = folium.Map(location=[long, lat], zoom_start=10)

nboroughs = 10

# set color scheme for the boroughs
x = np.arange(nboroughs)
ys = [i + x + (i*x)**2 for i in range(nboroughs)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to map
for lat, lng, label, pc, borough in zip(df_coord['Latitude'], df_coord['Longitude'], df_coord['Neighbourhood'], df_coord['Postal Code'], df_coord['Borough_num']):
    label = folium.Popup(str(label)+', '+str(pc), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=6,
        popup=label,
        color=rainbow[borough-1],
        fill=True,
        fill_color=rainbow[borough-1],
        fill_opacity=1,
        parse_html=False).add_to(map_gta)  
    
map_gta

In [16]:
df_coord.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighbourhood,Latitude,Longitude,Borough_num
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Central Toronto,9,9,9,9,9
Downtown Toronto,19,19,19,19,19
East Toronto,5,5,5,5,5
East York,5,5,5,5,5
Etobicoke,12,12,12,12,12
Mississauga,1,1,1,1,1
North York,24,24,24,24,24
Scarborough,17,17,17,17,17
West Toronto,6,6,6,6,6
York,5,5,5,5,5


In [17]:
Toronto_df = df_coord[df_coord['Borough'].str.contains('Toronto')]
Toronto_df.set_index(['Postal Code'], inplace=True)
Toronto_df.reset_index(inplace = True)
Toronto_df.drop(['Borough_num'], axis = 1, inplace = True)
Toronto_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen\'s Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [18]:
CLIENT_ID = 'BWOB1P21JS33S2PXKD3GZFTBMCXAYROV5XDSAJR4Y2EINCUT' # your Foursquare ID
CLIENT_SECRET = '1QLIPVLNYGB40IUESTCFUO3ISCM003N1H31MFUYTJT5OKEI3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

radius = 500
LIMIT = 100

Your credentails:
CLIENT_ID: BWOB1P21JS33S2PXKD3GZFTBMCXAYROV5XDSAJR4Y2EINCUT
CLIENT_SECRET:1QLIPVLNYGB40IUESTCFUO3ISCM003N1H31MFUYTJT5OKEI3


In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:

Toronto_venues = getNearbyVenues(names=Toronto_df['Neighbourhood'],
                                   latitudes=Toronto_df['Latitude'],
                                   longitudes=Toronto_df['Longitude']
                                  )

Regent Park, Harbourfront
Queen\'s Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport

In [21]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))
Toronto_venues=Toronto_venues[~Toronto_venues['Venue Category'].isin(['Café','Coffee Shop'])]
Toronto_venues.head()

There are 234 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
5,"Regent Park, Harbourfront",43.65426,-79.360636,Corktown Common,43.655618,-79.356211,Park


In [22]:
Toronto_venues['Venue Category']

0                      Bakery
2         Distribution Center
3                         Spa
4                         Pub
5                        Park
6                  Restaurant
7              Breakfast Spot
8        Gym / Fitness Center
9               Historic Site
10             Farmers Market
11             Breakfast Spot
14      Performing Arts Venue
17             Chocolate Shop
18                       Park
19               Dessert Shop
20                       Park
21          French Restaurant
24                     Bakery
25                Yoga Studio
27                    Theater
28                        Pub
29                    Theater
30                Event Space
31                 Shoe Store
32                     Bakery
33             Ice Cream Shop
34                        Pub
35                Art Gallery
36             Cosmetics Shop
37           Asian Restaurant
                ...          
1580         Sushi Restaurant
1582      Japanese Restaurant
1583      

In [23]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

Toronto_onehot.set_index('Neighborhood', inplace = True)
Toronto_onehot.reset_index(inplace = True)

Toronto_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019231,0.0,0.0,0.019231,0.0,0.0,0.019231


In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [26]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Berczy Park,Cocktail Bar,Beer Bar,Bakery,Seafood Restaurant,Cheese Shop
1,"Brockton, Parkdale Village, Exhibition Place",Breakfast Spot,Gym,Bakery,Grocery Store,Furniture / Home Store
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Auto Workshop,Smoke Shop,Brewery,Spa
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Airport,Airport Food Court
4,Central Bay Street,Italian Restaurant,Japanese Restaurant,Sandwich Place,Bar,Department Store


In [27]:
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [28]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = Toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Bakery,Pub,Park,Theater,Breakfast Spot
1,M7A,Downtown Toronto,"Queen\'s Park, Ontario Provincial Government",43.662301,-79.389494,1,Sushi Restaurant,Diner,Yoga Studio,Distribution Center,Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Clothing Store,Cosmetics Shop,Japanese Restaurant,Bubble Tea Shop,Middle Eastern Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Restaurant,Gastropub,American Restaurant,Cocktail Bar,Gym
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Health Food Store,Trail,Pub,Yoga Studio,Deli / Bodega


In [29]:
# create map of Manhattan using latitude and longitude values

long = 43.7043
lat = -79.3883
map_Toronto = folium.Map(location=[long, lat], zoom_start=11)


# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_Toronto)
       
map_Toronto

In [30]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
18,Lawrence Park,0,Park,Construction & Landscaping,Swim School,Bus Line,Yoga Studio
33,Rosedale,0,Park,Playground,Trail,Dance Studio,Ethiopian Restaurant


In [31]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Regent Park, Harbourfront",1,Bakery,Pub,Park,Theater,Breakfast Spot
1,"Queen\'s Park, Ontario Provincial Government",1,Sushi Restaurant,Diner,Yoga Studio,Distribution Center,Restaurant
2,"Garden District, Ryerson",1,Clothing Store,Cosmetics Shop,Japanese Restaurant,Bubble Tea Shop,Middle Eastern Restaurant
3,St. James Town,1,Restaurant,Gastropub,American Restaurant,Cocktail Bar,Gym
4,The Beaches,1,Health Food Store,Trail,Pub,Yoga Studio,Deli / Bodega
5,Berczy Park,1,Cocktail Bar,Beer Bar,Bakery,Seafood Restaurant,Cheese Shop
6,Central Bay Street,1,Italian Restaurant,Japanese Restaurant,Sandwich Place,Bar,Department Store
7,Christie,1,Grocery Store,Park,Candy Store,Nightclub,Restaurant
8,"Richmond, Adelaide, King",1,Restaurant,Gym,Deli / Bodega,Hotel,Thai Restaurant
9,"Dufferin, Dovercourt Village",1,Bakery,Pharmacy,Middle Eastern Restaurant,Bank,Supermarket


In [32]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
29,"Moore Park, Summerhill East",2,Trail,Yoga Studio,Deli / Bodega,Event Space,Ethiopian Restaurant


In [33]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]].head()

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
19,Roselawn,3,Garden,Music Venue,Yoga Studio,Deli / Bodega,Ethiopian Restaurant


In [34]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
21,"Forest Hill North & West, Forest Hill Road Park",4,Jewelry Store,Trail,Mexican Restaurant,Sushi Restaurant,Yoga Studio


### Cluster Types

0: Park Heavy

1: Food Heavy

2: Outdoors

3: Gardens 

4: Shopping