# Import the Libraries Needed

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

r  = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

data = r.text

soup = BeautifulSoup(data, 'html.parser')

# Go Grab the table and pull the pieces out with Soup

Find the useful table
Work through each row
    - If the row has Not Assigned as a Borough - move on
    - If the row has no Neighborhood but a Borough - copy the Neighborhood
Save the dict into a dataframe

Now we have a dataframe but multiple rows per PostalCode and Borough

In [2]:
table = soup.find( "table", {"class":"wikitable sortable"} )

td = table.findAll('tr')[1:]

hoods = []

for data in td:
    col = data.find_all('td')
    details = {}
    for i,col in enumerate(col):
        if i == 1:
            details['Borough'] = (col.text.replace('\n',''))
            if details['Borough'] == 'Not assigned':
                break
        if i == 0:
            details['PostalCode'] = (col.text.replace('\n',''))
        if i == 2: 
            details['Neighborhood'] = (col.text.replace('\n',''))
            if details['Neighborhood'] == 'Not assigned':
                details['Neighborhood'] = details['Borough']
            hoods.append(details)

df=pd.DataFrame(hoods)
df=df[['PostalCode','Borough','Neighborhood']]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


# Use Group by to do the work of concatinating the like Boroughs together

In [3]:
dfn=df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()
dfn.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Displaying the Shape of the Final Dataset

In [4]:
dfn.shape

(103, 3)

# Adding GEO data

Dowload spreadsheet into dataframe
The URL is dynamic and needs to be updated daily

In [6]:
geo_data=pd.read_csv('https://dl.boxcloud.com/d/1/b1!XtEtGjSNAljkdKB5XaHamJs3aV0zpvETdZZOiPnAc4mHP3wNXJ1OBQeH4yrtxi2ASp-7-Ry41nt-35tPDV61WHIyjOZnLtXFYQWCEtNOS3TIKtcBQOhQ-JzZFo8JrV8EptorxwI8oWxTntL9g14oHgn2LNiiswovBBnyRFa4TiwcHIJR8XJWj9t8QY3Em72g4oJjgyeyB2sSUUOCwG_TujOwJPS9M_ln__SX_t__sLFk40-FroQ8MxSR1fqQ8SCdS6eoli9Rz4itxT-vklqkSKxUVjsocTGAVXoSayw970AqSDt6X_WOwzI7_BFMtLfavhAMnSlv56PcWm_12N4qRy2yh4qn7y3bniLYMVtCgZdoyayhVd5pYr8TibCNm0XFWlWuFq8nO1tZuQ7urGf_Z-VnBV-wyH6f1-X3Knn5Xh_ew0SDhTV0t2HgnW7i8sBrm5_ckdYzHjjVk9Ed5Ash5GwQIiAtnWsFmK6mWnE3eVjEnMYxuqLLzOlBZ5dAkWrZkZx6H5p4UbM6vgoKZwAWUxlFs6JRdrE0PuoTzJ_XwACmnswH8tIpEEbRXhqyEWo-pvJk6K1OA-X4JytaQe1swbSPbmFOw1DVtJXQfqnvyfjIgPlgIiGWZ0Qs1rZdHp43hjd0UbEpFwI6LkRA5Eaiyv8JQhCm524TZ1XPijC0AaHORj_Swz20gkUO6ZXMlVXwnRahIQ07nca1sEiub2uEdOYb32j-E897Kp6fQ9OvdZV5WyGOeom5-nXw2s2ERAxFeLm7j8FddB3b3otTUfo34kGNmcJjyq4wTVfJ4d7x3OpbtQG1FrOEK8ht490-b5t05wsXYkuN6_y0iCcRufG7B8aZk8DzFBndgg-JHji-EbE4qQdHXU3aTDaE_NBMBSwnbO3xlq14EAze-fibZqXQCK53zf3YlhbmTiHWns8sREwESrfxSbY7piB9AvAz9U4m-falvSPJGtrn7GEdVCvGYjmXtwNKRtot9bRD1dWvlBvHva3tPV81Ygn_eUCWa_Bmg7MxS2Tf7zo7mIXeUQ_uwblx9jiwPIAaTBziuSOdSZvYxbQWjZuTnIOBglOIhyRxLOKE0iUYmHuBu0B3t0EEQ0Pa9CaUzh62uis9VIwiRvVEzMxJ6G_opAyJqyCvN-IEp4OEbYUNAdgRdqTIyqPIxgVcZX_UdM6cfwXwRlm7f78vtNq_mam_Enj1Nnitzvu1jbpVyjpTRLERIjNgQ9Q0aZHBUoCqYWW23hbBGJD77Q0OPTboFeI125b66TmqGEcQUPHo_jjhH0xA5CsLlBiF9al5E-A4lpheA9r8h3F4O2IQoWw7ezFui_Aa1Pv_hecwI9TH62KWmBPZKq0GwzT7RPGH-kB3dllGAez8uY1M5_A3cWiHNpQ7X9mJPoQROemh/download')
print(geo_data.shape)
geo_data.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
alldata=pd.merge(dfn, geo_data, how='left', left_on=['PostalCode'], right_on=['Postal Code'],
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)
print(alldata.shape)
alldata.head()

(103, 6)


Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


# KNN Analysis Section
Fist import the needed Libraries

In [14]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.17.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  20.93 MB/s
geopy-1.17.0-p 100% |################################| Time: 0:00:00  29.70 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00   7.80 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  34.36 MB/s
vincent-0.4.4- 100% |###################

## Fist cut down the number of neighborhoods researched to ones with Toronto in the name

In [12]:
analysis_data=alldata[alldata['Neighborhood'].str.contains("Toronto")]
print (analysis_data.shape)
analysis_data.head()

(7, 6)


Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
30,M3K,North York,"CFB Toronto,Downsview East",M3K,43.737473,-79.464763
40,M4J,East York,East Toronto,M4J,43.685347,-79.338106
46,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
59,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",M5J,43.640816,-79.381752
60,M5K,Downtown Toronto,"Design Exchange,Toronto Dominion Centre",M5K,43.647177,-79.381576


## Find Our Starting Location

In [15]:

address = 'Toronto, ON, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Create map around the starting location and add in the neighborhoods

In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(analysis_data['Latitude'], analysis_data['Longitude'], analysis_data['Borough'], analysis_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Setup to use Foursquare

In [40]:
radius=500
LIMIT=100
CLIENT_ID = 'XXXX' # your Foursquare ID
CLIENT_SECRET = 'XXX' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XXXX
CLIENT_SECRET:XXX


## Make it easy to call Foursquare for each Neighborhood

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Get the data for each Neighborhood

In [25]:
toronto_venues = getNearbyVenues(names=analysis_data['Neighborhood'], latitudes=analysis_data['Latitude'], longitudes=analysis_data['Longitude'])
print(toronto_venues.shape)
toronto_venues.head()

CFB Toronto,Downsview East
East Toronto
North Toronto West
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Harbord,University of Toronto
Humber Bay Shores,Mimico South,New Toronto
(271, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"CFB Toronto,Downsview East",43.737473,-79.464763,Toronto Downsview Airport (YZD),43.738883,-79.470111,Airport
1,"CFB Toronto,Downsview East",43.737473,-79.464763,Ttc Bus #120 - Plewes Rd,43.734898,-79.464221,Bus Stop
2,"CFB Toronto,Downsview East",43.737473,-79.464763,Ancaster Park,43.733724,-79.465528,Park
3,East Toronto,43.685347,-79.338106,The Path,43.683923,-79.335007,Park
4,East Toronto,43.685347,-79.338106,Sammon Convenience,43.686951,-79.335007,Convenience Store


## Review and see how much data we have for each Neighborhood

In [29]:
toronto_venues.groupby('Neighborhood').count()
# print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CFB Toronto,Downsview East",3,3,3,3,3,3
"Design Exchange,Toronto Dominion Centre",100,100,100,100,100,100
East Toronto,3,3,3,3,3,3
"Harbord,University of Toronto",34,34,34,34,34,34
"Harbourfront East,Toronto Islands,Union Station",100,100,100,100,100,100
"Humber Bay Shores,Mimico South,New Toronto",12,12,12,12,12,12
North Toronto West,19,19,19,19,19,19


In [30]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [31]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# toronto_onehot.head()

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,American Restaurant,Aquarium,Art Gallery,Asian Restaurant,Bakery,Bank,Bar,...,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Wine Bar
0,"CFB Toronto,Downsview East",0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Design Exchange,Toronto Dominion Centre",0.0,0.0,0.04,0.0,0.01,0.01,0.01,0.0,0.02,...,0.02,0.0,0.0,0.01,0.02,0.01,0.01,0.0,0.0,0.01
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Harbord,University of Toronto",0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,...,0.0,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.029412,0.0
4,"Harbourfront East,Toronto Islands,Union Station",0.0,0.0,0.0,0.05,0.01,0.0,0.02,0.01,0.02,...,0.01,0.01,0.01,0.01,0.0,0.01,0.02,0.01,0.0,0.01
5,"Humber Bay Shores,Mimico South,New Toronto",0.0,0.0,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North Toronto West,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"CFB Toronto,Downsview East",Airport,Park,Bus Stop,Wine Bar,French Restaurant,Convenience Store,Dance Studio,Deli / Bodega,Dessert Shop,Diner
1,"Design Exchange,Toronto Dominion Centre",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Italian Restaurant,Gym,Gastropub,Deli / Bodega,Seafood Restaurant
2,East Toronto,Coffee Shop,Convenience Store,Park,Wine Bar,Food Court,Concert Hall,Dance Studio,Deli / Bodega,Dessert Shop,Diner
3,"Harbord,University of Toronto",Café,Bookstore,Bakery,Bar,Coffee Shop,Restaurant,Japanese Restaurant,Nightclub,Noodle House,Comfort Food Restaurant
4,"Harbourfront East,Toronto Islands,Union Station",Coffee Shop,Aquarium,Hotel,Pizza Place,Café,Restaurant,Scenic Lookout,Italian Restaurant,Brewery,Fried Chicken Joint
5,"Humber Bay Shores,Mimico South,New Toronto",Coffee Shop,Fried Chicken Joint,Restaurant,Café,Sandwich Place,Fast Food Restaurant,Pizza Place,Liquor Store,Pharmacy,Bakery
6,North Toronto West,Coffee Shop,Sporting Goods Shop,Yoga Studio,Sandwich Place,Grocery Store,Gift Shop,Fast Food Restaurant,Mexican Restaurant,Diner,Dessert Shop


## Import KMeans
Run it against the Toronto Data

In [34]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 2, 0, 0, 4, 3], dtype=int32)

## Merge analysis cluster labels back into data

In [36]:
toronto_merged = analysis_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
30,M3K,North York,"CFB Toronto,Downsview East",M3K,43.737473,-79.464763,1,Airport,Park,Bus Stop,Wine Bar,French Restaurant,Convenience Store,Dance Studio,Deli / Bodega,Dessert Shop,Diner
40,M4J,East York,East Toronto,M4J,43.685347,-79.338106,0,Coffee Shop,Convenience Store,Park,Wine Bar,Food Court,Concert Hall,Dance Studio,Deli / Bodega,Dessert Shop,Diner
46,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678,2,Coffee Shop,Sporting Goods Shop,Yoga Studio,Sandwich Place,Grocery Store,Gift Shop,Fast Food Restaurant,Mexican Restaurant,Diner,Dessert Shop
59,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",M5J,43.640816,-79.381752,0,Coffee Shop,Aquarium,Hotel,Pizza Place,Café,Restaurant,Scenic Lookout,Italian Restaurant,Brewery,Fried Chicken Joint
60,M5K,Downtown Toronto,"Design Exchange,Toronto Dominion Centre",M5K,43.647177,-79.381576,0,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Italian Restaurant,Gym,Gastropub,Deli / Bodega,Seafood Restaurant


## Create Chart Showing the like clusters

In [38]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters