# Segmenting and Clustering Neighborhoods in Toronto

## Part 1:
#### Imporing necessary libraries

In [1]:
#!pip install lxml
#!pip install geocoder
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import requests
import geocoder
import re
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 
print('Libraries imported.')

Libraries imported.


#### Scraping the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, to obtain the data that is in the table of postal codes and transforming the data into a pandas dataframe

In [2]:
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
website_html = requests.get(website_url).text

In [3]:
website_dfs = pd.read_html(website_html)
website_dfs # contains list of tables from the wikipedia page

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                           Neighborhood  
 0                                                  NaN  
 1                                                  NaN  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                                NaN  
 176                                                NaN  
 177                

In [4]:
postal_df = website_dfs[0] # selecting the postal code table 
postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
print('Shape of postal_df is', postal_df.shape)

Shape of postal_df is (180, 3)


#### Dropping rows with a borough that is Not assigned

In [6]:
postal_df = postal_df[postal_df['Borough']!='Not assigned']
postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
print(' After dropping rows with Borough \'not asssigned\', shape of postal_df is', postal_df.shape)

 After dropping rows with Borough 'not asssigned', shape of postal_df is (103, 3)


#### Combining rows with same postal code into one row with the neighborhoods separated with comma

In [8]:
postal_df = postal_df.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply('.'.join).reset_index()
postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
print('Shape of postal_df is', postal_df.shape)

Shape of postal_df is (103, 3)


#### Replacing neighborhood Not assigned with the row's borough value

In [10]:
postal_df = postal_df.replace(to_replace={'Neighborhood':['Not assigned', np.nan]},\
                              value={'Neighborhood':[postal_df['Borough'], postal_df['Borough']]})

postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Printing the number of rows of the final dataframe

In [11]:
print('Total number of rows in the final dataframe is', postal_df.shape[0])

Total number of rows in the final dataframe is 103


## Part 2:
#### Getting location coordinates of the neighborhoods using geocoder api and merging them with the postal code dataframe
##### P.S: In the assignment question geocoder.google method was told to be used to get the coordinates but as the method was not working properly, geocoder.arcgis method was used instead.

In [12]:
for i, row in postal_df.iterrows():
    lat_lng_coords = None   # initializing variable to None
    postal_code = row['Postal Code']
    
    # looping until getting the coordinates
    while(lat_lng_coords is None):
      g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    postal_df.loc[i, 'Latitude'] = latitude
    postal_df.loc[i, 'Longitude'] = longitude

postal_df.head()

Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.785779,-79.157368
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765806,-79.185284
3,M1G,Scarborough,Woburn,43.771545,-79.218135
4,M1H,Scarborough,Cedarbrae,43.768791,-79.238813


## Part 3:
## Explore and cluster the neighborhoods in Toronto
#### Printing numbers of unique postal code, borough and neighborhood

In [13]:
print('The dataframe has {} postal codes, {} borough and {} neighborghoods'\
      .format(len(postal_df['Postal Code'].unique()), len(postal_df['Borough'].unique()),\
              postal_df['Neighborhood'].apply(lambda x : len(x.split(', '))).sum()))

The dataframe has 103 postal codes, 10 borough and 217 neighborghoods


#### Using geocoder library to get coordinates of Torondo City

In [14]:
geocoder_torondo = geocoder.arcgis('Toronto, Ontario')

torondo_lat_lng_coords = geocoder_torondo.latlng
torondo_latitude = torondo_lat_lng_coords[0]
torondo_longitude = torondo_lat_lng_coords[1]

print('The geograpical coordinate of Torondo City are {}, {}.'.format(torondo_latitude, torondo_longitude))

The geograpical coordinate of Torondo City are 43.648690000000045, -79.38543999999996.


#### Creating a map of Toronto City with neighborhoods superimposed on top.

In [15]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[torondo_latitude, torondo_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(postal_df['Latitude'], postal_df['Longitude'], postal_df['Borough'], postal_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='lightblue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Name of the boroughs in Toronto City

In [16]:
print('Boroughs in the dataset:\n')

for borough_name in postal_df['Borough'].unique().tolist():
    print(borough_name)

Boroughs in the dataset:

Scarborough
North York
East York
East Toronto
Central Toronto
Downtown Toronto
York
West Toronto
Mississauga
Etobicoke


### Exploring Neighborhood in the boroughs with 'Toronto' in their name
#### Name of the boroughs that have "Toronto" in their names

In [17]:
print('\nBoroughs that have \'Toronto\' in their names:\n')

for toronto_name in postal_df['Borough'][postal_df['Borough'].apply(lambda x : bool(re.search('(?i)toronto', x)))].unique().tolist():
    print(toronto_name)


Boroughs that have 'Toronto' in their names:

East Toronto
Central Toronto
Downtown Toronto
West Toronto


#### Selecting only boroughs that have 'Toronto' in their names

In [18]:
toronto_df = postal_df[postal_df['Borough'].apply(lambda x : bool(re.search('(?i)toronto', x)))]
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.678148,-79.295349
41,M4K,East Toronto,"The Danforth West, Riverdale",43.683424,-79.354564
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668291,-79.315578
43,M4M,East Toronto,Studio District,43.648,-79.33926
44,M4N,Central Toronto,Lawrence Park,43.729455,-79.386415


In [19]:
print('Number of rows in toronto_df data-frame is {} and the number of boroughs that have \'Toronto\' in their name is {}'.\
      format(toronto_df.shape[0], len(toronto_df['Borough'].unique())))

Number of rows in toronto_df data-frame is 39 and the number of boroughs that have 'Toronto' in their name is 4


#### Getting the geographical coordinates of the boroughs with "Toronto" in their names and determining the average of the coordinates

In [20]:
lat_lng_list = []

for borough in toronto_df['Borough'].unique():

    geocoder_borough = geocoder.arcgis('{}, Ontario'.format(borough))
    borough_lat_lng = geocoder_borough.latlng
    print('The geograpical coordinates for {} are {}.'.format(borough, tuple(borough_lat_lng)))
    lat_lng_list.append(borough_lat_lng)

toronto_lat_lng = np.mean(lat_lng_list, axis=0)
print('\nThe average of these geographical coordinates', tuple(toronto_lat_lng))

The geograpical coordinates for East Toronto are (43.65902991949926, -79.34900993060764).
The geograpical coordinates for Central Toronto are (43.60972747387443, -79.49284428510224).
The geograpical coordinates for Downtown Toronto are (43.65011000000004, -79.38289999999995).
The geograpical coordinates for West Toronto are (43.6647117748975, -79.34634588922047).

The average of these geographical coordinates (43.64589479206781, -79.39277502623257)


#### Creating a map of the neighborhoods in East Toronto, Central Toronto, Downtown Toronto and West Toronto

In [21]:
# creat map with the average geographical coordinates
map_toronto = folium.Map(location=[toronto_lat_lng[0], toronto_lat_lng[1]], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='lightblue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Defining Foursquare Credentials and Version

In [22]:
CLIENT_ID = 'CST51LR5AA5BKXDUQGWPTUU43Z1FGFX2ZKNUH1MF3I4HSZIV' 
CLIENT_SECRET = 'YYMXSYE1UBP5BTY4QUBFGRH1FVFTVZOUPUCQOM4LC5GAM45Y'
VERSION = '20180605' # Foursquare API version

print('Credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Credentails:
CLIENT_ID: CST51LR5AA5BKXDUQGWPTUU43Z1FGFX2ZKNUH1MF3I4HSZIV
CLIENT_SECRET:YYMXSYE1UBP5BTY4QUBFGRH1FVFTVZOUPUCQOM4LC5GAM45Y


#### Defining function to get categories of nearby venues for the boroughs

In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Running the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [24]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

#### Size of the resulting dataframe

In [25]:
print(toronto_venues.shape)
toronto_venues.head()

(1592, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.678148,-79.295349,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.678148,-79.295349,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.678148,-79.295349,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.678148,-79.295349,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Beaches,43.678148,-79.295349,Calvary Baptist Church,43.681059,-79.299246,Church


#### Venues returned for each Neighborhood

In [26]:
toronto_venues.groupby('Neighborhood')['Neighborhood'].count()

Neighborhood
Berczy Park                                                                                                    63
Brockton, Parkdale Village, Exhibition Place                                                                   44
Business reply mail Processing Centre, South Central Letter Processing Plant Toronto                          100
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport     66
Central Bay Street                                                                                             55
Christie                                                                                                       12
Church and Wellesley                                                                                           84
Commerce Court, Victoria Hotel                                                                                100
Davisville                                                                 

#### Unique categories can be curated from all the returned venues

In [27]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 222 uniques categories.


### Analyze Each Neighborhood

In [28]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# adding neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# moveing neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
print('Shape of the new dataframe',toronto_onehot.shape)

Shape of the new dataframe (1592, 222)


#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [30]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,Berczy Park,0.015873,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.022727,0.0,0.0,0.022727,0.0,0.022727,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,0.0,...,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018182,0.018182,0.018182,0.0,0.0


In [31]:
print('Shape of the new dataframe',toronto_grouped.shape)

Shape of the new dataframe (39, 222)


#### Printing each neighborhood along with the top 5 most common venues

In [32]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.10
1        Cocktail Bar  0.05
2                Café  0.03
3          Restaurant  0.03
4  Seafood Restaurant  0.03


----Brockton, Parkdale Village, Exhibition Place----
                    venue  freq
0                    Café  0.07
1             Coffee Shop  0.07
2             Pizza Place  0.05
3  Thrift / Vintage Store  0.05
4               Gift Shop  0.05


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
              venue  freq
0       Coffee Shop  0.07
1             Hotel  0.05
2  Asian Restaurant  0.03
3        Restaurant  0.03
4              Café  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
               venue  freq
0               Café  0.06
1        Coffee Shop  0.06
2               Park  0.05
3  French Restaurant  0.05
4         Restaurant  0.05


----Central Bay Street----
      

#### Creating new dataframe for the top 10 venues for each neighborhood.

In [33]:
# defining function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [34]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Beer Bar,Bakery,Seafood Restaurant,Breakfast Spot,Hotel,Cheese Shop,Café
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Restaurant,Gift Shop,Pizza Place,Thrift / Vintage Store,Cocktail Bar,Sandwich Place,Caribbean Restaurant,Liquor Store
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Hotel,Restaurant,Japanese Restaurant,Asian Restaurant,Café,Thai Restaurant,Taco Place,Mediterranean Restaurant,Bar
3,"CN Tower, King and Spadina, Railway Lands, Har...",Café,Coffee Shop,Restaurant,Park,French Restaurant,Bar,Gym,Speakeasy,Lounge,Gym / Fitness Center
4,Central Bay Street,Coffee Shop,Bubble Tea Shop,Middle Eastern Restaurant,Restaurant,Sandwich Place,Plaza,Japanese Restaurant,Clothing Store,Café,Department Store


### Cluster Neighborhoods

#### Running *k*-means to cluster the neighborhood into 5 clusters.

In [35]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#### Creating a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [36]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df.copy()

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.678148,-79.295349,0,Church,Health Food Store,Pub,Trail,Distribution Center,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
41,M4K,East Toronto,"The Danforth West, Riverdale",43.683424,-79.354564,0,Bus Line,Business Service,Park,Grocery Store,Discount Store,Women's Store,Dumpling Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668291,-79.315578,0,Park,Sandwich Place,Fast Food Restaurant,Italian Restaurant,Pet Store,Pub,Coffee Shop,Restaurant,Movie Theater,Burrito Place
43,M4M,East Toronto,Studio District,43.648,-79.33926,0,Baseball Field,Business Service,Athletics & Sports,Government Building,Night Market,Women's Store,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
44,M4N,Central Toronto,Lawrence Park,43.729455,-79.386415,1,Bus Line,Swim School,Women's Store,Food,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant


#### Creating Map of the Neighborhood Clusters

In [37]:
# create map
map_clusters = folium.Map(location=[toronto_lat_lng[0], toronto_lat_lng[1]], zoom_start=11)

# set color scheme for the clusters
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters
#### Examining each of the five clusters

#### Cluster 1

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,The Beaches,Church,Health Food Store,Pub,Trail,Distribution Center,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
41,"The Danforth West, Riverdale",Bus Line,Business Service,Park,Grocery Store,Discount Store,Women's Store,Dumpling Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
42,"India Bazaar, The Beaches West",Park,Sandwich Place,Fast Food Restaurant,Italian Restaurant,Pet Store,Pub,Coffee Shop,Restaurant,Movie Theater,Burrito Place
43,Studio District,Baseball Field,Business Service,Athletics & Sports,Government Building,Night Market,Women's Store,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
45,Davisville North,Food & Drink Shop,Breakfast Spot,Gym,Park,Department Store,Dog Run,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant
47,Davisville,Dessert Shop,Pizza Place,Italian Restaurant,Coffee Shop,Sandwich Place,Café,Sushi Restaurant,Indian Restaurant,Gas Station,Fast Food Restaurant
49,"Summerhill West, Rathnelly, South Hill, Forest...",Light Rail Station,Coffee Shop,Park,Supermarket,Liquor Store,Skating Rink,Women's Store,Donut Shop,Fast Food Restaurant,Farmers Market
51,"St. James Town, Cabbagetown",Coffee Shop,Bakery,Park,Café,Italian Restaurant,Restaurant,Pub,Pizza Place,Market,Pharmacy
52,Church and Wellesley,Coffee Shop,Japanese Restaurant,Restaurant,Gay Bar,Café,Pub,Sushi Restaurant,Bubble Tea Shop,Dance Studio,Men's Store
53,"Regent Park, Harbourfront",Pub,Café,Coffee Shop,Athletics & Sports,Bank,Food Truck,Distribution Center,Mediterranean Restaurant,Tech Startup,Bakery


#### Cluster 2

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Lawrence Park,Bus Line,Swim School,Women's Store,Food,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant


#### Cluster 3

In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
46,"North Toronto West, Lawrence Park",Playground,Gym Pool,Park,Garden,Women's Store,Dog Run,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
48,"Moore Park, Summerhill East",Playground,Tennis Court,Gym,Park,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant
50,Rosedale,Playground,Grocery Store,Park,Candy Store,Dog Run,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant


#### Cluster 4

In [41]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
64,"Forest Hill North & West, Forest Hill Road Park",Gym / Fitness Center,Women's Store,Dog Run,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant


#### Cluster 5

In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Roselawn,IT Services,Women's Store,Donut Shop,Flower Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant


#### Determining the weightage of each venue category for each cluster

In [43]:
toronto_grouped_labeled = toronto_grouped.copy()
toronto_grouped_labeled['cluster'] = kmeans.labels_
toronto_grouped_labeled.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,cluster
0,Berczy Park,0.015873,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,...,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.0,0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.022727,0.0,0.0,0.022727,0.0,0.022727,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.03,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,0.0,...,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.0,0.0,0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.018182,0.018182,0.0,0.0,0


In [44]:
toronto_cluster_mean = toronto_grouped_labeled.groupby(by='cluster').mean()
toronto_cluster_mean

Unnamed: 0_level_0,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.004121,0.000689,0.000361,0.009375,0.007034,0.00086,0.00258,0.006723,0.013131,0.002453,...,0.001212,0.007323,0.000606,0.007853,0.000459,0.001772,0.003342,0.004428,0.001203,0.000309
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Top 5 venue categories for each cluster

In [45]:
for cluster_label ,row in toronto_cluster_mean.iterrows(): 
    print('For Cluster {}, top 5 venue categories: {}\n'.format(cluster_label+1,', '.join(return_most_common_venues(row,5))))

For Cluster 1, top 5 venue categories: Coffee Shop, Café, Park, Restaurant, Grocery Store

For Cluster 2, top 5 venue categories: Bus Line, Swim School, Women's Store, Food, Fish Market

For Cluster 3, top 5 venue categories: Playground, Park, Gym, Tennis Court, Grocery Store

For Cluster 4, top 5 venue categories: Gym / Fitness Center, Women's Store, Donut Shop, Flower Shop, Fish Market

For Cluster 5, top 5 venue categories: IT Services, Women's Store, Donut Shop, Flower Shop, Fish Market

