# Toronto Clusterting

In [1]:
# install and import libraries
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c anaconda beautifulsoup4 --yes


import requests
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         240 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0         conda-forge
    geopy:         1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    certifi:       2019.

## Steps from Previous Notebooks

### Aquire and clean Data

In [4]:
wiki_page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(wiki_page.content, 'html.parser')

#creating DF
table = soup.find('tbody')
rows = table.select('tr')
row = [r.get_text() for r in rows]
df = pd.DataFrame(row)

#cleanling and organizing DF
df1 = df[0].str.split('\n', expand=True)
df1.rename(columns=df1.iloc[0], inplace=True)
df1.drop(df1.index[0], inplace=True)

# new df with no NAN Boroughs
df2 = df1[df1.Borough != 'Not assigned']

#new df with combined Neighborhoods
df3 = df2.groupby(['Postcode', 'Borough'], sort = False).agg(','.join)
df3.reset_index(inplace=True)

df3.replace("Not assigned", "Queen's Park", inplace=True)
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Create DF with Latitude and Longitude values

In [5]:
url = "http://cocl.us/Geospatial_data"
df_LL = pd.read_csv(url)

#rename
df_LL.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
#merge
df_final = pd.merge(df3, df_LL, on='Postcode')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Clustering Neighborhoods by Distance

### Checking unique Boroughs and Neighborhoods in DF

In [7]:
print('This DF has {} Boroughs and {} Neighbourhoods.'.format(len(df_final['Borough'].unique()),df_final.shape[0]))

This DF has 11 Boroughs and 103 Neighbourhoods.


### Selecting only Boroughs with 'Toronto' in the name

In [9]:
df_TO=df_final[df_final['Borough'].str.contains('Toronto')]
df_TO.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Generating a Map to see the clustering of Neighborhoods in Toronto

In [16]:
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode('Toronto')
latitude = location.latitude
longitude = location.longitude

Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

for lati, long, borough, neighborhood in zip(df_TO['Latitude'], df_TO['Longitude'], 
                                           df_TO['Borough'], df_TO['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lati, long],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_opacity=1.0,
        parse_html=False).add_to(Toronto_map)  
    
Toronto_map

## Foursquare Data

### FS connection vars

In [17]:
CLIENT_ID = 'BWGZTEQA531RHIK4GFUWI1AEWB10JFXOOHIGSCVQZCVYEJFT' # your Foursquare ID
CLIENT_SECRET = '5NZH5XVLAMLO1313KUAUSPIQM5EISKGMVTXCWO5T2RSLBGRN' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BWGZTEQA531RHIK4GFUWI1AEWB10JFXOOHIGSCVQZCVYEJFT
CLIENT_SECRET:5NZH5XVLAMLO1313KUAUSPIQM5EISKGMVTXCWO5T2RSLBGRN


### Function to get data on individual neighbourhoods

In [21]:
LIMIT = 100
radius = 500
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lati, long in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
TO_venues = getNearbyVenues(names=df_TO['Neighbourhood'],
                                   latitudes=df_TO['Latitude'],
                                   longitudes=df_TO['Longitude']
                                  )

Harbourfront
Queen's Park
Ryerson,Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide,King,Richmond
Dovercourt Village,Dufferin
Harbourfront East,Toronto Islands,Union Station
Little Portugal,Trinity
The Danforth West,Riverdale
Design Exchange,Toronto Dominion Centre
Brockton,Exhibition Place,Parkdale Village
The Beaches West,India Bazaar
Commerce Court,Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North,Forest Hill West
High Park,The Junction South
North Toronto West
The Annex,North Midtown,Yorkville
Parkdale,Roncesvalles
Davisville
Harbord,University of Toronto
Runnymede,Swansea
Moore Park,Summerhill East
Chinatown,Grange Park,Kensington Market
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown,St. James Town
First Canadian Place,Underground city

In [24]:
TO = TO_venues
print(TO.shape)
TO.head()

(663, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.662744,-79.321558,Rorschach Brewing Co.,43.663483,-79.319824,Brewery
1,Harbourfront,43.662744,-79.321558,Leslieville Farmers Market,43.664901,-79.319784,Farmers Market
2,Harbourfront,43.662744,-79.321558,The Sidekick,43.664484,-79.325162,Comic Shop
3,Harbourfront,43.662744,-79.321558,Chino Locos,43.664653,-79.325584,Burrito Place
4,Harbourfront,43.662744,-79.321558,Queen Margherita Pizza,43.664685,-79.324164,Pizza Place


In [25]:
TO.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",17,17,17,17,17,17
Berczy Park,17,17,17,17,17,17
"Brockton,Exhibition Place,Parkdale Village",17,17,17,17,17,17
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",17,17,17,17,17,17
"Cabbagetown,St. James Town",17,17,17,17,17,17
Central Bay Street,17,17,17,17,17,17
"Chinatown,Grange Park,Kensington Market",17,17,17,17,17,17
Christie,17,17,17,17,17,17
Church and Wellesley,17,17,17,17,17,17


In [26]:
print('There are {} uniques categories.'.format(len(TO['Venue Category'].unique())))

There are 16 uniques categories.


In [27]:
# one hot encoding
TO_OH = pd.get_dummies(TO[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
TO_OH['Neighborhood'] = TO['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [TO_OH.columns[-1]] + list(TO_OH.columns[:-1])
TO_OH = TO_OH[fixed_columns]

TO_OH.head()

Unnamed: 0,Neighborhood,Auto Workshop,Brewery,Burrito Place,Butcher,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center,Light Rail Station,Park,Pizza Place,Recording Studio,Restaurant,Skate Park,Yoga Studio
0,Harbourfront,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Harbourfront,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,Harbourfront,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Harbourfront,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [29]:
TO_G = TO_OH.groupby('Neighborhood').mean().reset_index()

### Most common Venues

In [30]:
num_top_venues = 5

for hood in TO_G['Neighborhood']:
    print("----"+hood+"----")
    temp = TO_G[TO_G['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                venue  freq
0  Light Rail Station  0.12
1       Auto Workshop  0.06
2             Brewery  0.06
3       Burrito Place  0.06
4             Butcher  0.06


----Berczy Park----
                venue  freq
0  Light Rail Station  0.12
1       Auto Workshop  0.06
2             Brewery  0.06
3       Burrito Place  0.06
4             Butcher  0.06


----Brockton,Exhibition Place,Parkdale Village----
                venue  freq
0  Light Rail Station  0.12
1       Auto Workshop  0.06
2             Brewery  0.06
3       Burrito Place  0.06
4             Butcher  0.06


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.12
1       Auto Workshop  0.06
2             Brewery  0.06
3       Burrito Place  0.06
4             Butcher  0.06


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
                venue  freq
0  Light Rail Station  

In [45]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = TO_G['Neighborhood']

for ind in np.arange(TO_G.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(TO_G.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted['1st Most Common Venue'].value_counts()

Light Rail Station    39
Name: 1st Most Common Venue, dtype: int64

## Clustering

### Since the most common type of Venue for all selected Neighborhoods is a Light Rail Station as indicated above, clustering via it, would be redundant.