# 1. Creating a table of postal codes and assigned neighbourhoods

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Wikipedia page listing the Toronto ("M") postal codes with boroughs and neighbourhoods
data_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# fail fast on a hung connection or an HTTP error instead of parsing an error page
start_page = requests.get(data_url, timeout=30)
start_page.raise_for_status()
start_soup = BeautifulSoup(start_page.content, 'lxml')

# find the first table in the document
table = start_soup.find('table')
if table is None:
    # find() returns None when nothing matches; stop here with a readable message
    raise ValueError("No <table> element found at {}".format(data_url))

# get all the rows
trs = table.find_all('tr')

In [2]:
# Collect one record per assigned postal code and build the dataframe once at the end.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.)
postal_columns = ['Postal Code', 'Borough', 'Neighborhood']
records = []

# parse the rows except for the first one (the header row)
for row in trs[1:]:
    # read the <td> cells directly so nested tags (links, spans) cannot shift the columns,
    # and strip whitespace instead of blindly dropping the last character
    cell_values = [td.get_text().strip() for td in row.find_all('td')]
    if len(cell_values) < 3:
        # not a data row (e.g. a nested header); nothing to record
        continue
    postal_code, borough, neighborhood = cell_values[:3]
    if borough == 'Not assigned':
        continue
    records.append({'Postal Code': postal_code,
                    'Borough': borough,
                    # a borough without a named neighbourhood reuses the borough name
                    'Neighborhood': borough if neighborhood == 'Not assigned' else neighborhood})

df = pd.DataFrame(records, columns=postal_columns)

In [3]:
df.shape

(103, 3)

# 2. Adding neighbourhood coordinates

In [4]:
# download geospatial data (CSV read straight from the URL; it is merged with the
# postal-code table on "Postal Code" in the next cell)
coors = pd.read_csv("https://cocl.us/Geospatial_data")

In [94]:
# attach the coordinates to every postal code; the left join keeps all rows of df
full_df = df.merge(coors, how='left', on='Postal Code')

# make the neighbourhood names unique by appending the postal code to each of them
unique_names = full_df['Neighborhood'] + full_df['Postal Code']
full_df['Neighborhood'] = unique_names
full_df['Neighborhood'].shape

(103,)

# 3. Preparing data

In [7]:
!conda install -c conda-forge geopy --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [10]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         713 KB

The following NEW packages will be INSTALLED:

    altair:  4.1.0-py_1 conda-forge
    branca:  0.4.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
altair-4.1.0         | 614 KB    | #####

In [11]:
from geopy.geocoders import Nominatim
import folium
import json
import numpy as np

# look up the coordinates used to centre the maps below
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; fail with a clear message
    # rather than an AttributeError on the attribute access below
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude, longitude = location.latitude, location.longitude

In [71]:
# check what the data looks like on the map
# (named toronto_map so the built-in map() is not shadowed for the rest of the notebook)
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per neighbourhood
for lat, lng, borough, neighborhood in zip(full_df['Latitude'], full_df['Longitude'], full_df['Borough'], full_df['Neighborhood']):
    # parse_html belongs to the Popup, which escapes the label text; it is not a marker option
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

In [129]:
# The code was removed by Watson Studio for sharing.

In [130]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch the venues around each location from the Foursquare "explore" endpoint.

    Relies on the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT
    values, which are defined outside this function.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences: a label and the coordinates of each location.
        They are consumed together with zip().
    radius : int, optional
        Value forwarded as the API's ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        A location without any venue contributes a single placeholder row
        ('None', 0, 0, 'No venues'). Empty input gives an empty frame that
        still carries all seven columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string instead of formatting it by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        results = groups[0].get('items', []) if groups else []

        if not results:
            # keep the location in the output even though nothing was found nearby
            rows.append((name, lat, lng, 'None', 0, 0, 'No venues'))
            continue

        for item in results:
            venue = item['venue']
            # a venue may come back without any category; do not index into an empty list
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else 'Uncategorized'
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    # passing the columns to the constructor also works when there are no rows at all
    return pd.DataFrame(rows, columns=columns)

In [111]:
# query the venues around every neighbourhood (one API request per row of full_df,
# using the function's default radius)
toronto_venues = getNearbyVenues(names=full_df['Neighborhood'], latitudes=full_df['Latitude'], longitudes=full_df['Longitude'])

In [113]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,ParkwoodsM3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,ParkwoodsM3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,ParkwoodsM3A,43.753259,-79.329656,Corrosion Service Company Limited,43.752432,-79.334661,Construction & Landscaping
3,Victoria VillageM4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria VillageM4A,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [114]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category can itself be called "Neighborhood". Keep that indicator under a
# distinct name so that adding the key column below cannot silently overwrite it.
# (rename is a no-op when no such category is present)
toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood column as the first column, by name rather than by position
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

toronto_onehot.shape

(2134, 273)

In [115]:
# average the indicator columns per neighbourhood: each value becomes the share of that
# neighbourhood's venues falling into the category
rows_by_neighborhood = toronto_onehot.groupby('Neighborhood')
toronto_grouped = rows_by_neighborhood.mean().reset_index()
toronto_grouped.shape

(103, 273)

In [116]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = ranked.index.values
    return ranked_labels[0:num_top_venues]

In [125]:
num_top_venues = 10


def _ordinal(rank):
    """Return rank with its English ordinal suffix: 1 -> '1st', 11 -> '11th', 22 -> '22nd'."""
    if 10 <= rank % 100 <= 20:
        # 11th, 12th and 13th are the exceptions to the st/nd/rd rule
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')
    return '{}{}'.format(rank, suffix)


# create columns according to number of top venues
columns = ['Neighborhood'] + ['{} Most Common Venue'.format(_ordinal(rank))
                              for rank in range(1, num_top_venues + 1)]

# rank the categories of every neighbourhood, then build the dataframe in one go
top_venues = [return_most_common_venues(toronto_grouped.iloc[pos, :], num_top_venues)
              for pos in range(toronto_grouped.shape[0])]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,AgincourtM1S,Latin American Restaurant,Clothing Store,Breakfast Spot,Lounge,Skating Rink,Drugstore,Dumpling Restaurant,Donut Shop,Doner Restaurant,Deli / Bodega
1,"Alderwood, Long BranchM8W",Pizza Place,Skating Rink,Sandwich Place,Pool,Pub,Coffee Shop,Gym,Curling Ice,Drugstore,Donut Shop
2,"Bathurst Manor, Wilson Heights, Downsview Nort...",Bank,Coffee Shop,Bridal Shop,Ice Cream Shop,Supermarket,Deli / Bodega,Middle Eastern Restaurant,Mobile Phone Shop,Pizza Place,Pharmacy
3,Bayview VillageM2K,Japanese Restaurant,Café,Bank,Chinese Restaurant,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Women's Store
4,"Bedford Park, Lawrence Manor EastM5M",Coffee Shop,Restaurant,Sandwich Place,Italian Restaurant,Grocery Store,Hobby Shop,Comfort Food Restaurant,Pharmacy,Pizza Place,Café


# 4. Clustering

In [124]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means is fitted on the category columns only, so drop the label column.
# The keyword form is used because pandas >= 2.0 no longer accepts the axis
# as a positional argument (drop('Neighborhood', 1)).
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

kclusters = 7

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=12, random_state=0).fit(toronto_grouped_clustering)

In [126]:
# add clustering labels
# Re-running this cell after refitting k-means must refresh the labels, so an
# existing column is overwritten instead of being left stale.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.shape

(103, 12)

In [127]:
# attach the cluster label and the top-venue columns to each neighbourhood's postal code,
# borough and coordinates (join on the neighbourhood name, keeping every row of full_df)
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = full_df.join(venues_by_neighborhood, on='Neighborhood')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,ParkwoodsM3A,43.753259,-79.329656,1,Park,Food & Drink Shop,Construction & Landscaping,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store,Curling Ice
1,M4A,North York,Victoria VillageM4A,43.725882,-79.315572,1,Coffee Shop,Portuguese Restaurant,Hockey Arena,Pizza Place,Intersection,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
2,M5A,Downtown Toronto,"Regent Park, HarbourfrontM5A",43.65426,-79.360636,1,Coffee Shop,Bakery,Pub,Park,Restaurant,Breakfast Spot,Theater,Café,Electronics Store,Distribution Center
3,M6A,North York,"Lawrence Manor, Lawrence HeightsM6A",43.718518,-79.464763,1,Clothing Store,Accessories Store,Furniture / Home Store,Women's Store,Arts & Crafts Store,Event Space,Miscellaneous Shop,Boutique,Athletics & Sports,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial GovernmentM7A",43.662301,-79.389494,1,Coffee Shop,Diner,Sushi Restaurant,Distribution Center,Sculpture Garden,Discount Store,Music Venue,Italian Restaurant,Mexican Restaurant,Beer Bar


In [128]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    if pd.isna(cluster):
        # the join leaves NaN for a neighbourhood without a label; nothing to colour
        continue
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index with the label itself rather than label - 1, which only worked for
    # cluster 0 through negative-index wraparound
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters