# Assignment: Segmenting ans Clustering Neighborhoods in Toronto

## Part 1: Import Neighborhood data from Toronto

In [1]:
!pip install bs4



### Import necessary linaries for this assignment.

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup


### Webscrabing is used to get postalcode information.

In [3]:
#Create a BeautifulSoup object
url="https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969%22"
html_data=requests.get(url).text
soup=BeautifulSoup(html_data,"html5lib")


In [4]:
#find a html table in the web page
table = soup.find('table')

In [5]:
# Store data into dataframe
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

### Output: Table for Toronto

In [6]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [7]:
df.shape

(103, 3)

---

## Part 2: Get the latitude and the longitude coordinates

In [8]:
#Import geographical coordinates based on csv file (Geospatial_Coordinates.csv)
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.

if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_9e302b3748ad4d0fb99564819af604ad = 'https://s3-api.us-geo.objectstorage.softlayer.net'
else:
    endpoint_9e302b3748ad4d0fb99564819af604ad = 'https://s3-api.us-geo.objectstorage.service.networklayer.com'

client_9e302b3748ad4d0fb99564819af604ad = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='JGP3pBDcTP0Nhc24Lt6jhx29O60BEW2H_VTAMubHH8cg',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_9e302b3748ad4d0fb99564819af604ad)

body = client_9e302b3748ad4d0fb99564819af604ad.get_object(Bucket='capstoneproject-donotdelete-pr-tc7c4wpkkv6w9i',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_GCoordinates = pd.read_csv(body)
df_GCoordinates.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Merge Neighborhood and Coordinates
mergeDf=df.merge(df_GCoordinates, left_on='PostalCode',right_on='Postal Code')
#rearrange columns
cols=['Postal Code','Borough','Neighborhood','Latitude','Longitude']
mergeDf=mergeDf[cols]
mergeDf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


## Part 3: Explore and cluster neighborhoods

### Part 3a: Explore neighborhoods

In [10]:
#Select boroughs that contain the word Toronto
toronto_data = mergeDf[mergeDf['Borough'].str.contains('Toronto', na=False)].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [11]:
toronto_data.shape

(39, 5)

In [12]:
#Load venues 
nearby_venues1 = pd.read_json("https://raw.githubusercontent.com/ibm-developer-skills-network/yczvh-DataFilesForIBMProjects/master/segmenting_neighborhoods.json")    
nearby_venues1.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                 'Venue', 
                 'Venue Latitude', 
                 'Venue Longitude', 
                 'Venue Category']
Toronto_venues=nearby_venues1
Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Sail Sushi,43.765951,-79.191275,Restaurant


In [13]:
# Calculate amount of venues for each neighborhood
Toronto_venues_count=Toronto_venues.groupby('Neighborhood').count()
Toronto_venues_count.drop(['Venue','Neighborhood Longitude','Venue Latitude','Venue Longitude','Venue Category'],axis=1, inplace=True)
Toronto_venues_count.rename(columns={'Neighborhood Latitude':'Number of Venues'}, inplace=True)
print('Amount of venues per Neighborhood:')
Toronto_venues_count

Amount of venues per Neighborhood:


Unnamed: 0_level_0,Number of Venues
Neighborhood,Unnamed: 1_level_1
Agincourt,4
"Alderwood, Long Branch",8
"Bathurst Manor, Wilson Heights, Downsview North",23
Bayview Village,4
"Bedford Park, Lawrence Manor East",22
...,...
Willowdale West,5
"Willowdale, Newtonbrook",1
Woburn,4
Woodbine Heights,8


In [14]:
# Calculate the amount of unique venue categories
print('There are {} uniques venue categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 241 uniques venue categories.


In [15]:
# Analyze each neighborhood (only for the 39 neighborhoods containing the word 'Toronto' )
#1. Prepare dataframe to find the most common venues of each neighborhood 
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = toronto_data['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = ['Neighborhood']  + [col for col in Toronto_onehot if col != 'Neighborhood']
Toronto_onehot = Toronto_onehot[fixed_columns]


Toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Garden District, Ryerson",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,St. James Town,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Berczy Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
Toronto_onehot.shape

(1337, 241)

In [17]:
#2. Calculate mean of the frequency of occurrence of each category
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.shape

(39, 241)

In [18]:
#3. Top 10 venues for each neighborhood
# Write a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [19]:
num_top_venues = 10
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Restaurant,Accessories Store,Motel,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Caribbean Restaurant,Accessories Store,Movie Theater,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
2,"CN Tower, King and Spadina, Railway Lands, Har...",Bakery,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
3,Central Bay Street,Mexican Restaurant,Accessories Store,Adult Boutique,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Middle Eastern Restaurant
4,Christie,Rental Car Location,Accessories Store,Motel,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant


In [20]:
neighborhoods_venues_sorted.shape

(39, 11)

### Part 3b: Cluster neighborhood

In [54]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# The neighborhoods will be clutered into 4 clusters
kcluster =4
Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kcluster, random_state=0).fit(Toronto_grouped_clustering)


In [55]:
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 3, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [56]:
#Add cluster number
toronto_cluster = toronto_data
toronto_cluster['Cluster Labels']=kmeans.labels_

#Add top 10 venues
toronto_cluster=toronto_cluster.merge(neighborhoods_venues_sorted, left_on='Neighborhood',right_on='Neighborhood')


toronto_cluster.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Fast Food Restaurant,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Bar,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Bank,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Electronics Store,Accessories Store,Gym / Fitness Center,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Restaurant,Accessories Store,Motel,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant


In [46]:
toronto_cluster.shape

(39, 16)

In [27]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [28]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [59]:
#visualize the resulting clusters

latitude=43.6511
longitude=-79.38

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kcluster)
ys = [i + x + (i*x)**2 for i in range(kcluster)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_cluster['Latitude'], toronto_cluster['Longitude'], toronto_cluster['Neighborhood'], toronto_cluster['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [60]:
# Excermine Clusters
#Cluster 1:
toronto_cluster.loc[toronto_cluster['Cluster Labels'] == 0, toronto_cluster.columns[[1]+[2] + list(range(5, toronto_cluster.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",0,Fast Food Restaurant,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
1,Downtown Toronto,"Garden District, Ryerson",0,Bar,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
3,East Toronto,The Beaches,0,Electronics Store,Accessories Store,Gym / Fitness Center,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
4,Downtown Toronto,Berczy Park,0,Restaurant,Accessories Store,Motel,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
5,Downtown Toronto,Central Bay Street,0,Mexican Restaurant,Accessories Store,Adult Boutique,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Middle Eastern Restaurant
6,Downtown Toronto,Christie,0,Rental Car Location,Accessories Store,Motel,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
7,Downtown Toronto,"Richmond, Adelaide, King",0,Medical Center,Accessories Store,Adult Boutique,Martial Arts School,Massage Studio,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
8,West Toronto,"Dufferin, Dovercourt Village",0,Breakfast Spot,Accessories Store,Plaza,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
9,East York/East Toronto,The Danforth East,0,Coffee Shop,Accessories Store,Movie Theater,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
10,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",0,Coffee Shop,Accessories Store,Movie Theater,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant


In [61]:
#Cluster 2: 
toronto_cluster.loc[toronto_cluster['Cluster Labels'] == 1, toronto_cluster.columns[[1]+[2] + list(range(5, toronto_cluster.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,East Toronto,"India Bazaar, The Beaches West",1,Thai Restaurant,Accessories Store,Lounge,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
33,Downtown Toronto,Rosedale,1,Bakery,Accessories Store,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
35,Downtown Toronto,"St. James Town, Cabbagetown",1,Soccer Field,Accessories Store,Playground,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant


In [62]:
#Cluster 3: 
toronto_cluster.loc[toronto_cluster['Cluster Labels'] == 2, toronto_cluster.columns[[1]+[2] + list(range(5, toronto_cluster.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,Central Toronto,"Moore Park, Summerhill East",2,Bus Line,Accessories Store,Plaza,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
31,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",2,Bus Station,Accessories Store,Plaza,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant


In [64]:
print ('3 clusters were created. Cluster 3 clould be named "Traffic - Cluster", due to the reason that the most commen venue are stations. Cluster 1 and 2 are quiet similiar.')

3 clusters were created. Cluster 3 clould be named "Traffic - Cluster", due to the reason that the most commen venue are stations. Cluster 1 and 2 are quiet similiar.
