## Week 3 - Assignment - Clustering & Segmenting

### Neighbourhoods in Toronto

<b>Import the required libraries</b>

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

<b>Get web page containing data</b>

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'  # URL containing table
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)  # pull page
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page in the next cell.
page.raise_for_status()

<b>Parse web page for table</b>

In [3]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so results can vary by machine.
soup = BeautifulSoup(page.content, 'html.parser')

table = soup.find('table', {'class': 'wikitable sortable'})  # find table in page
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the downloaded page")

# strip() removes the surrounding whitespace/newline of each cell; the previous
# [:-1] slice cut the last character off any cell that did not end in a newline.
column_names = [header.get_text().strip() for header in table.find_all('th')]  # find column names

row_data = table.find_all('tr')  # find rows
# One list of cell texts per <tr>; a row without <td> cells yields an empty list.
rows = [
    [cell.get_text().strip() for cell in row.find_all('td')]
    for row in row_data
]

<b>Convert table data into a Dataframe</b>

In [4]:
# Skip the first scraped row (presumably the header row, which has no <td> cells)
# and label the remaining columns with the scraped header texts.
body_rows = rows[1:]
table_data = pd.DataFrame(body_rows)
table_data.columns = column_names
table_data.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


<b>Clean and process the Dataframe</b>

In [5]:
# Step A: keep only postal codes that have an assigned borough
assigned = table_data['Borough'] != 'Not assigned'

# Step B: keep the first neighbourhood listed for each postal code.
# The scraped cell is a comma-separated list; every later entry is discarded.
first_neighbourhood = table_data.loc[assigned, 'Neighbourhood'].str.split(',').str[0]

# Step C: assemble the result, drop rows with empty values and renumber the index.
# Built as a single chain of new frames so no derived frame is mutated in place
# (the earlier inplace calls on slices could trigger pandas copy warnings).
cleaned_df = (
    table_data.loc[assigned, ['Postal Code', 'Borough']]
    .assign(Neighborhood=first_neighbourhood)
    .dropna()
    .reset_index(drop=True)
)

In [6]:
# Preview the first rows of the cleaned table.
cleaned_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M6A,North York,Lawrence Manor
4,M7A,Downtown Toronto,Queen's Park


In [7]:
# Report the dimensions of the cleaned table.
n_rows, n_cols = cleaned_df.shape
print(f'The table has {n_rows} rows and {n_cols} columns')

The table has 103 rows and 3 columns


<b>Get Coordinates from CSV</b>

In [8]:
# Load the coordinates file from the working directory; the preview below shows
# one row per postal code with its latitude and longitude.
geodata = pd.read_csv('Geospatial_Coordinates.csv')  # read GeoData CSV
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Report the dimensions of the coordinates table.
geo_rows, geo_cols = geodata.shape
print(f'The table has {geo_rows} rows and {geo_cols} columns')

The table has 103 rows and 3 columns


<b>Merge Cleaned Dataframe with GeoData</b>

In [10]:
# Left join: every row of cleaned_df is kept and gains the coordinates of its postal code.
combined_df = pd.merge(cleaned_df, geodata, how='left', on='Postal Code')
combined_df.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
98,M8X,Etobicoke,The Kingsway,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
101,M8Y,Etobicoke,Old Mill South,43.636258,-79.498509
102,M8Z,Etobicoke,Mimico NW,43.628841,-79.520999


In [11]:
# Distinct boroughs versus the number of rows in the Neighborhood column.
borough_count = len(combined_df['Borough'].unique())
neighborhood_count = len(combined_df['Neighborhood'])
print(f'The dataframe has {borough_count} boroughs and {neighborhood_count} neighborhoods.')

The dataframe has 10 boroughs and 103 neighborhoods.


In [12]:
# Map centre; these constants are reused by the cluster map further down.
LATITUDE = 43.7238
LONGITUDE = -79.3886

map_main = folium.Map(location=[LATITUDE, LONGITUDE], zoom_start=11.45)

# add one marker per row of combined_df
marker_fields = zip(combined_df['Latitude'], combined_df['Longitude'],
                    combined_df['Borough'], combined_df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_fields:
    # parse_html is an option of folium.Popup, so it is only passed here;
    # it was previously also handed to CircleMarker, which has no such option.
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='darkred',
        fill=True,
        fill_color='#eb5d36',
        fill_opacity=0.7,
    ).add_to(map_main)

map_main

### Foursquare Credentials

In [13]:
# Read the Foursquare credentials from the environment so real keys never have to
# be saved in the notebook. The placeholders are used when the variables are unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
VERSION = '20180605'  # value sent as the API's `v` query parameter

In [14]:
# Request settings, defined once instead of on every loop iteration.
LIMIT = 100   # value sent as the `limit` query parameter
RADIUS = 500  # value sent as the `radius` query parameter
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'

venues_list = list()
fetch_completed = True

print('Trying to fetch venues...')

for neighborhood, latitude, longitude in \
zip(combined_df['Neighborhood'], combined_df['Latitude'], combined_df['Longitude']):

    # API request. Passing the query as `params` lets requests build and encode the
    # URL; the earlier multi-line f-string put its indentation spaces into the URL.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': f'{latitude},{longitude}',
        'radius': RADIUS,
        'limit': LIMIT,
    }

    # Get result
    results = requests.get(EXPLORE_URL, params=params, timeout=30).json()
    status = results['meta']['code']

    if status == 429:
        # Quota exhausted: further requests would fail the same way, so stop.
        print('Out of API calls for the day.')
        fetch_completed = False
        break
    if status != 200:
        print(f'Error Type: {status}')
        fetch_completed = False
        break

    venues = results["response"]["groups"][0]["items"]

    # Append to Venues list
    for item in venues:
        venue = item['venue']
        categories = venue.get('categories') or []
        if not categories:
            # A venue without any category cannot be one-hot encoded later; skip it
            # instead of failing on categories[0].
            continue
        venue_category = categories[0]['name']
        venue_name = venue['name']
        venue_lat = venue['location']['lat']
        venue_long = venue['location']['lng']
        venues_list.append([neighborhood, latitude, longitude, venue_name, venue_lat, venue_long, venue_category])

if fetch_completed:
    print('Complete.')
else:
    print(f'Stopped early with {len(venues_list)} venues collected.')

Trying to fetch venues...
Complete.


In [15]:
# Name the columns at construction time: this also works when venues_list is empty,
# whereas assigning seven names to a zero-column frame afterwards raises.
nearby_venues = pd.DataFrame(
    venues_list,
    columns=['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
             'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'],
)

In [16]:
# Preview the last venues that were collected.
nearby_venues.tail()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
2146,Mimico NW,43.628841,-79.520999,Islington Florist & Nursery,43.630156,-79.518718,Flower Shop
2147,Mimico NW,43.628841,-79.520999,Koala Tan Tanning Salon & Sunless Spa,43.63137,-79.519006,Tanning Salon
2148,Mimico NW,43.628841,-79.520999,Once Upon A Child,43.631075,-79.51829,Kids Store
2149,Mimico NW,43.628841,-79.520999,Value Village,43.631269,-79.518238,Thrift / Vintage Store
2150,Mimico NW,43.628841,-79.520999,Kingsway Boxing Club,43.627254,-79.526684,Gym


In [17]:
# Save the collected venues to a CSV file in the working directory.
nearby_venues.to_csv('toronto_neighborhood_venues.csv')

In [18]:
# Count the non-null entries of every column per venue category.
nearby_venues.groupby('Venue Category').count()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Accessories Store,1,1,1,1,1,1
Afghan Restaurant,1,1,1,1,1,1
Airport,2,2,2,2,2,2
Airport Food Court,1,1,1,1,1,1
Airport Gate,1,1,1,1,1,1
...,...,...,...,...,...,...
Warehouse Store,1,1,1,1,1,1
Wine Bar,10,10,10,10,10,10
Wings Joint,1,1,1,1,1,1
Women's Store,5,5,5,5,5,5


In [19]:
# Number of distinct venue categories among the collected venues.
category_values = nearby_venues['Venue Category'].unique()
unique_categories = len(category_values)
print(f'There are {unique_categories} uniques categories.')

There are 275 uniques categories.


In [20]:
# one hot encoding
toronto_onehot = pd.get_dummies(nearby_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = nearby_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.tail()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
2146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Average the indicator columns per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category.
neighborhood_means = toronto_onehot.groupby('Neighborhood').mean()
# Turn the group key back into an ordinary first column.
toronto_grouped = neighborhood_means.reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0
1,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0
2,Bathurst Manor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0
4,Bedford Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Wexford,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.12500,0.0,0.0,0.0,0.0
87,Willowdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.02381,0.0,0.0,0.0,0.0
88,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0
89,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.166667,0.00000,0.0,0.0,0.0,0.0


In [22]:
def return_most_common_venues(row: pd.Series, num_top_venues: int) -> np.ndarray:
    """Return the index labels of the largest values in ``row``.

    The first element of ``row`` is ignored (in this notebook it holds the
    neighborhood name); the remaining elements are ranked from highest to
    lowest and the labels of the first ``num_top_venues`` are returned.

    Parameters
    ----------
    row : pandas.Series
        One row whose first element is skipped and whose remaining elements
        are comparable values labelled by the Series index.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels in descending order of their values; shorter than
        ``num_top_venues`` when the row has fewer ranked elements.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [23]:
num_top_venues = 5

# ordinal suffixes for ranks 1-3; every later rank falls back to 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# (explicit length check instead of a bare `except:` around the list lookup)
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append(f'{rank}{suffix} Most Common Venue')

# rank the venue categories of every neighborhood
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[position, :], num_top_venues)
    for position in range(toronto_grouped.shape[0])
]

# create a new dataframe in one step rather than filling it cell by cell
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Dog Run
1,Alderwood,Pizza Place,Pharmacy,Pub,Pool,Sandwich Place
2,Bathurst Manor,Bank,Coffee Shop,Middle Eastern Restaurant,Sushi Restaurant,Ice Cream Shop
3,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store
4,Bedford Park,Thai Restaurant,Sandwich Place,Coffee Shop,Italian Restaurant,Restaurant


### Clustering

In [24]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1])

In [25]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = combined_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.dropna(inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Park,Food & Drink Shop,Women's Store,Distribution Center,Department Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,French Restaurant,Coffee Shop,Intersection,Pizza Place,Portuguese Restaurant
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Café
3,M6A,North York,Lawrence Manor,43.718518,-79.464763,0,Clothing Store,Women's Store,Coffee Shop,Boutique,Miscellaneous Shop
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0,Coffee Shop,Diner,Yoga Studio,College Auditorium,Bar


In [26]:
# create map
map_clusters = folium.Map(location=[LATITUDE, LONGITUDE], zoom_start=11.45)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.5).add_to(map_clusters)
       
map_clusters

## Examine Clusters

### Cluster 1
#### Coffee shops & Restaurants

In [27]:
# Rows labelled 0, showing the Borough column followed by everything from
# 'Cluster Labels' onwards (column positions 1 and 5..end).
in_cluster = toronto_merged['Cluster Labels'] == 0
shown_columns = [1] + list(range(5, toronto_merged.shape[1]))
cluster1 = toronto_merged.loc[in_cluster].iloc[:, shown_columns]

In [28]:
# Display the rows assigned to cluster label 0.
cluster1

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,0,French Restaurant,Coffee Shop,Intersection,Pizza Place,Portuguese Restaurant
2,Downtown Toronto,0,Coffee Shop,Pub,Bakery,Park,Café
3,North York,0,Clothing Store,Women's Store,Coffee Shop,Boutique,Miscellaneous Shop
4,Downtown Toronto,0,Coffee Shop,Diner,Yoga Studio,College Auditorium,Bar
6,Scarborough,0,Fast Food Restaurant,Women's Store,Dog Run,Department Store,Dessert Shop
...,...,...,...,...,...,...,...
97,Downtown Toronto,0,Coffee Shop,Café,Hotel,Gym,Restaurant
98,Etobicoke,0,River,Pool,Curling Ice,Dance Studio,Deli / Bodega
99,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant
100,East Toronto,0,Light Rail Station,Auto Workshop,Park,Comic Shop,Pizza Place


In [29]:
# Count the top-ranked venue categories once; value_counts() sorts by frequency,
# so the first entry is the most frequent one.
venue_counts1 = cluster1['1st Most Common Venue'].value_counts()
max_type1 = venue_counts1.index[0]
# .iloc makes the positional lookup explicit; plain [0] on a label-indexed Series
# relies on a deprecated positional fallback.
count1 = venue_counts1.iloc[0]

In [30]:
# Report the most frequent top-ranked venue category among the rows with cluster label 0.
print(f'The most common venue in Cluster 1 is of the Category: "{max_type1}". There are {count1} of them.')

The most common venue in Cluster 1 is of the Category: "Coffee Shop". There are 22 of them.


### Cluster 2
#### Recreational & Shopping

In [31]:
# Rows labelled 1, showing the Borough column followed by everything from
# 'Cluster Labels' onwards (column positions 1 and 5..end).
in_cluster = toronto_merged['Cluster Labels'] == 1
shown_columns = [1] + list(range(5, toronto_merged.shape[1]))
cluster2 = toronto_merged.loc[in_cluster].iloc[:, shown_columns]

In [32]:
# Display the rows assigned to cluster label 1.
cluster2

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,1,Park,Food & Drink Shop,Women's Store,Distribution Center,Department Store
21,York,1,Park,Pool,Women's Store,Gourmet Shop,Cupcake Shop
35,East York,1,Park,Convenience Store,Metro Station,Women's Store,Department Store
61,Central Toronto,1,Park,Swim School,Bus Line,Dumpling Restaurant,Drugstore
64,York,1,Park,Women's Store,Dog Run,Department Store,Dessert Shop
66,North York,1,Park,Convenience Store,Electronics Store,Women's Store,Department Store
85,Scarborough,1,Park,Playground,Distribution Center,Deli / Bodega,Department Store
91,Downtown Toronto,1,Park,Playground,Trail,Drugstore,Donut Shop


In [33]:
# Count the top-ranked venue categories once; value_counts() sorts by frequency,
# so the first entry is the most frequent one.
venue_counts2 = cluster2['1st Most Common Venue'].value_counts()
max_type2 = venue_counts2.index[0]
# .iloc makes the positional lookup explicit; plain [0] on a label-indexed Series
# relies on a deprecated positional fallback.
count2 = venue_counts2.iloc[0]

In [34]:
# Report the most frequent top-ranked venue category among the rows with cluster label 1.
print(f'The most common venue in Cluster 2 is of the Category: "{max_type2}". There are {count2} of them.')

The most common venue in Cluster 2 is of the Category: "Park". There are 8 of them.


### Cluster 3

#### Sports & Women's shopping

In [35]:
# Rows labelled 2, showing the Borough column followed by everything from
# 'Cluster Labels' onwards (column positions 1 and 5..end).
in_cluster = toronto_merged['Cluster Labels'] == 2
shown_columns = [1] + list(range(5, toronto_merged.shape[1]))
cluster3 = toronto_merged.loc[in_cluster].iloc[:, shown_columns]

In [36]:
# Display the rows assigned to cluster label 2.
cluster3

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
57,North York,2,Baseball Field,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner
101,Etobicoke,2,Baseball Field,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner


In [37]:
# Count the top-ranked venue categories once; value_counts() sorts by frequency,
# so the first entry is the most frequent one.
venue_counts3 = cluster3['1st Most Common Venue'].value_counts()
max_type3 = venue_counts3.index[0]
# .iloc makes the positional lookup explicit; plain [0] on a label-indexed Series
# relies on a deprecated positional fallback.
count3 = venue_counts3.iloc[0]

In [38]:
# Report the most frequent top-ranked venue category among the rows with cluster label 2.
print(f'The most common venue in Cluster 3 is of the Category: "{max_type3}". There are {count3} of them.')

The most common venue in Cluster 3 is of the Category: "Baseball Field". There are 2 of them.
