# Explore and Cluster Toronto

## Task 1: Scrape https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M for the postal codes

Build the code to scrape the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, obtain the data in its table of postal codes, and transform that data into a pandas dataframe.

In [2]:
import pandas as pd
import numpy as np


In [3]:
# Wikipedia page that lists the postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# parse the HTML tables found on the page into a list of dataframes
dfs = pd.read_html(url)

# the postal-code table is the first table on the page
df = dfs[0]

# turn the "Not assigned" placeholder into NaN so pandas treats it as missing
df.replace(to_replace="Not assigned", value=np.nan, inplace=True)

# the Postal Code column is unique when its distinct count equals the row count
print(df.nunique(), df.shape, sep="\n")

Postal Code      180
Borough           10
Neighbourhood     99
dtype: int64
(180, 3)


### Create the dataframe according to the specifications
The dataframe will consist of three columns: PostalCode, Borough, and Neighbourhood.

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

If a cell has a borough but a Not assigned neighbourhood, then the neighbourhood will be the same as the borough.

In [4]:
# Keep only the rows that have an assigned Borough and renumber the index from 0.
df = df.dropna(subset=["Borough"]).reset_index(drop=True)

# Where the Neighbourhood is missing, fall back to the Borough of the same row.
# The result is assigned back to the column: calling an in-place method on
# df["Neighbourhood"] acts on a selection of the frame and is not guaranteed to
# update df itself (it does not under pandas copy-on-write).
df["Neighbourhood"] = df["Neighbourhood"].fillna(df["Borough"])

df.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Show the shape of the dataframe
In the last cell of your notebook, use the .shape attribute to print the number of rows of your dataframe.

In [5]:
# (number of rows, number of columns) of the cleaned postal-code dataframe
df.shape

(103, 3)

## Task 2 Add latitude and longitude to the data

Retrieve longitude and latitude for the postal codes and add this to the dataframe

In [6]:
# read the CSV of coordinates into a dataframe
# (the preview below shows Postal Code, Latitude and Longitude columns)
df_LL = pd.read_csv("http://cocl.us/Geospatial_data")
# preview the first rows
df_LL.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Create the five-column dataframe: match every row of df with the coordinates
# stored under the same 'Postal Code' in df_LL and append Latitude / Longitude.
df1 = df.join(df_LL.set_index('Postal Code'), on='Postal Code')

In [7]:
# after the join, discard every row that lacks a latitude or a longitude,
# then renumber the remaining rows from 0
df1.dropna(subset=["Latitude", "Longitude"], how="any", inplace=True)
df1.reset_index(drop=True, inplace=True)
df1.shape

(103, 5)

In [8]:
# preview the first rows of the postal codes together with their coordinates
df1.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Task 3 Explore and cluster neighbourhoods of Toronto


Import dependencies

In [9]:
import requests
from sklearn.cluster import KMeans
#!pip install folium
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


Only boroughs that contain the word Toronto will be explored.

In [10]:
# keep only the rows whose Borough name contains "Toronto" (case-insensitive)
Toronto_Boroughs = df1.loc[lambda frame: frame['Borough'].str.contains('Toronto', case=False)]
Toronto_Boroughs.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [11]:
# (number of rows, number of columns) of the Toronto-only selection
Toronto_Boroughs.shape

(39, 5)

### Create a function to get the nearby venues
This function queries the Foursquare API for the venues within a given radius of each postal code's coordinates and returns them as a dataframe.

In [13]:
def getNearbyVenues(postalcodes, names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues found around each postal code.

    For every (postal code, name, latitude, longitude) tuple, the Foursquare
    ``venues/explore`` endpoint is queried around the coordinates and one
    output row is produced per venue returned.

    The credentials and query settings are read from the notebook-level names
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``, which must be
    defined before the function is called with non-empty input.

    Parameters
    ----------
    postalcodes, names, latitudes, longitudes : iterables
        Parallel sequences; they are consumed together with ``zip``, so
        iteration stops at the shortest one.
    radius : int, optional
        Value sent to the API as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        Columns: 'Postal Code', 'Neighbourhood', 'Neighbourhood Latitude',
        'Neighbourhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude', 'Venue Category'. The frame is empty (but still has
        these columns) when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError, IndexError
        If the JSON payload does not have the expected
        ``response -> groups[0] -> items`` structure.
    """
    columns = ['Postal Code',
               'Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for postalcode, name, lat, lng in zip(postalcodes, names, latitudes, longitudes):
        # let requests build and escape the query string instead of formatting it by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # surface API errors (bad credentials, quota exceeded, ...) right here
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come without any category; record None instead of failing
            categories = venue.get('categories') or []
            rows.append((
                postalcode,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works when `rows` is empty
    Toronto_nearby_venues = pd.DataFrame(rows, columns=columns)

    return(Toronto_nearby_venues)

Call the function with the data from the Toronto Boroughs

In [14]:
# fetch the venues around every Toronto postal code, using the default search radius
Toronto_venues = getNearbyVenues(postalcodes=Toronto_Boroughs['Postal Code'],
                                 names=Toronto_Boroughs['Neighbourhood'], 
                                 latitudes=Toronto_Boroughs['Latitude'], 
                                 longitudes=Toronto_Boroughs['Longitude'])


In [15]:
# preview the first venues that were returned
Toronto_venues.head()

Unnamed: 0,Postal Code,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,M5A,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,M5A,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [16]:
# one-hot encode the venue categories: one indicator column per category, without a name prefix
Toronto_Categories = pd.get_dummies(
    Toronto_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# tag every venue row with the postal code it was found for
Toronto_Categories['Postal Code'] = Toronto_venues['Postal Code']

# rotate the column order so that the last column (the postal code) comes first
fixed_columns = list(Toronto_Categories.columns[-1:]) + list(Toronto_Categories.columns[:-1])
Toronto_Categories = Toronto_Categories.loc[:, fixed_columns]

Toronto_Categories.head()

Unnamed: 0,Postal Code,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# average the indicator columns per postal code, i.e. the share of each
# category among the venues of that postal code
Toronto_Cat_Grouped = Toronto_Categories.groupby('Postal Code', as_index=False).mean()
Toronto_Cat_Grouped

Unnamed: 0,Postal Code,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,...,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0


Create a *pandas* dataframe with the top 10 venues per area

Define a function that will return the top 10 venues in descending order

In [18]:
def return_most_common_venues(row, num_top_venues=10):
    """Return the labels of the highest-valued entries of ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of the grouped category table. Its first entry (the postal
        code) is skipped; the remaining entries are the per-category values.
    num_top_venues : int, optional
        How many labels to return (default 10). Fewer are returned when the
        row holds fewer categories.

    Returns
    -------
    numpy.ndarray
        Category names ordered by descending value.
    """
    # drop the leading identifier so only the category values are ranked
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)

    return row_categories_sorted.index.values[:num_top_venues]

Create the new dataframe

In [19]:
# ordinal suffixes for the 1st, 2nd and 3rd rank; every later rank uses "th"
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(10):
    # pick the suffix with an explicit bounds check rather than a bare `except`,
    # which would also hide unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per postal code
Postalcodes_venues_sorted = pd.DataFrame(columns=columns)
Postalcodes_venues_sorted['Postal Code'] = Toronto_Cat_Grouped['Postal Code']

# fill the ranking columns of each row with that postal code's most common categories
for ind in np.arange(Toronto_Cat_Grouped.shape[0]):
    Postalcodes_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_Cat_Grouped.iloc[ind, :])

Postalcodes_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Neighborhood,Trail,Pub,Health Food Store,Doner Restaurant,Dog Run,Distribution Center,Donut Shop,Cuban Restaurant,Discount Store
1,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Ice Cream Shop,Furniture / Home Store,Fruit & Vegetable Store,Pub,Pizza Place,Lounge
2,M4L,Fast Food Restaurant,Pet Store,Fish & Chips Shop,Park,Pizza Place,Movie Theater,Pub,Restaurant,Sandwich Place,Brewery
3,M4M,Coffee Shop,Gastropub,Brewery,Bakery,Café,American Restaurant,Yoga Studio,Neighborhood,Seafood Restaurant,Cheese Shop
4,M4N,Business Service,Park,Swim School,Bus Line,Yoga Studio,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center


### Cluster the postal codes

Run k-means to cluster the postal codes into 5 clusters

In [20]:
# set number of clusters
kclusters = 5

# k-means needs numeric features only, so leave out the identifier column.
# `columns=` is used because the positional axis argument (`drop(label, 1)`)
# is rejected by pandas 2.0 and later.
Toronto_Cat_Grouped_Clustering = Toronto_Cat_Grouped.drop(columns='Postal Code')

# run k-means clustering; n_init is spelled out so the number of restarts does
# not depend on the installed scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(Toronto_Cat_Grouped_Clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 0, 0, 0, 2, 0, 0, 0, 3, 0])

Create a new dataframe that includes the cluster and the top 10 venues for each postal code

In [21]:
# add clustering labels as the first column; when this cell is re-run the
# column already exists (DataFrame.insert would raise), so overwrite it instead
if 'Cluster Labels' in Postalcodes_venues_sorted.columns:
    Postalcodes_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    Postalcodes_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge to add the cluster label and top venues to each postal code / neighbourhood;
# postal codes that have no row in Postalcodes_venues_sorted get NaN in the added columns
Toronto_merged = Toronto_Boroughs.join(Postalcodes_venues_sorted.set_index('Postal Code'), on='Postal Code')

Toronto_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Theater,Breakfast Spot,Café,Mexican Restaurant,Beer Store,Spa
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,College Cafeteria,Sushi Restaurant,Diner,Bar,Italian Restaurant,Japanese Restaurant,Beer Bar,Distribution Center,Sandwich Place
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Cosmetics Shop,Electronics Store,Movie Theater
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Cocktail Bar,Gastropub,American Restaurant,Seafood Restaurant,Farmers Market,Bakery,Gym,Restaurant
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Neighborhood,Trail,Pub,Health Food Store,Doner Restaurant,Dog Run,Distribution Center,Donut Shop,Cuban Restaurant,Discount Store


Create map with clusters

In [22]:
address = 'Toronto'

# look up the coordinates used to centre the map
geolocator = Nominatim(user_agent="Canada_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear message
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of Toronto are 43.6534817, -79.3839347.


In [23]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# neutral color for postal codes that received no cluster label
no_cluster_color = '#808080'

# add markers to the map
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    if pd.isna(cluster):
        # the join leaves NaN for postal codes without venue data; a NaN cannot index the palette
        marker_color = no_cluster_color
        label_text = str(poi) + ' (no cluster)'
    else:
        # labels become floats when the column contains NaN, so cast before indexing
        cluster = int(cluster)
        # index the palette by the label itself (cluster 0 -> first color)
        marker_color = rainbow[cluster]
        label_text = str(poi) + ' Cluster ' + str(cluster)
    label = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters