# Second assignment: Segmenting and Clustering Neighborhoods in Toronto

### Importing necessary libraries

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# uncomment this line if you haven't completed the Foursquare API lab
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# uncomment this line if you haven't completed the Foursquare API lab
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


# Part 1. Data extraction and preparation

### Extracting Data from Wikipedia and creating a DataFrame

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
df_TorontoData=pd.read_html(url)[0]
df_TorontoData

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Removed 'Not assigned' values from Borough column

In [5]:
# Keep only the postal codes whose borough has been assigned, then renumber
# the remaining rows from 0.
has_borough = df_TorontoData['Borough'] != 'Not assigned'
df_TorontoData = df_TorontoData[has_borough].reset_index(drop=True)
df_TorontoData

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Although no 'Not assigned' values remain in Neighbourhood, the code below is still run to replace any such value with the Borough of the same row

In [6]:
# Wherever the neighbourhood is 'Not assigned', fall back to the borough name
# of the same row; every other neighbourhood is left untouched.
df_TorontoData['Neighbourhood'] = df_TorontoData['Neighbourhood'].mask(
    df_TorontoData['Neighbourhood'] == 'Not assigned',
    df_TorontoData['Borough'],
)

#### Part of the assignment was: 'More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.'

#### In this case, the information was already properly grouped in Wikipedia

In [7]:
df_TorontoData
#df_TorontoData.sort_values(by=['Postal Code'])

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [8]:
df_TorontoData.shape

(103, 3)

# Part 2. Adding coordinates (latitude and longitude) to Toronto DataFrame

In [9]:
url_coordinates= 'http://cocl.us/Geospatial_data'

In [10]:
df_Toronto_coor=pd.read_csv(url_coordinates)
df_Toronto_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Verifying shape of Toronto Data and Coordinates is the same

In [11]:
df_Toronto_coor.shape

(103, 3)

#### Joining Toronto Data with Toronto Coordinates in a new DataFrame

In [12]:
# Left-join the coordinates onto the neighbourhood table by postal code: every
# neighbourhood row is kept and gains the Latitude and Longitude columns.
df_Toronto = df_TorontoData.merge(df_Toronto_coor, on='Postal Code', how='left')
df_Toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [13]:
df_Toronto.shape

(103, 5)

# Part 3. Exploring Toronto

#### Review Toronto Map

In [14]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [15]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map: one circle per row, with the neighbourhood name as popup
marker_rows = zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighbourhood'])
for lat, lng, neighbourhood in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighbourhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_Toronto)

map_Toronto

#### Selecting the neighbourhoods that contain 'University' in their name

In [16]:
# na=False treats a missing neighbourhood name as "no match", so the boolean
# mask never contains NaN; regex=False matches the word literally.
has_university = df_Toronto['Neighbourhood'].str.contains("University", na=False, regex=False)
University_data = df_Toronto[has_university].reset_index(drop=True)
print(University_data.shape)
University_data.head()

(2, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3J,North York,"Northwood Park, York University",43.76798,-79.487262
1,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049


#### To visualize University neighbourhoods

In [18]:
map_Toronto_U = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every marker on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# add markers to map: one circle per university neighbourhood
for lat, lng, neighbourhood in zip(University_data['Latitude'],
                                   University_data['Longitude'],
                                   University_data['Neighbourhood']):
    popup = folium.Popup(neighbourhood, parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_Toronto_U)

map_Toronto_U

#### Exploring York and Toronto University neighbourhoods

In [22]:
# Row 0 of University_data is used as the York University neighbourhood.
neighbourhood_name1 = University_data.at[0, 'Neighbourhood']
York_latitude = University_data.at[0, 'Latitude']
York_longitude = University_data.at[0, 'Longitude']

york_message = 'Latitude and longitude values of {} are {}, {}.'
print(york_message.format(neighbourhood_name1, York_latitude, York_longitude))

Latitude and longitude values of Northwood Park, York University are 43.7679803, -79.48726190000001.


#### Connecting with Foursquare

In [20]:
import os

# Foursquare credentials are read from environment variables rather than being
# written into the notebook. os.environ.get returns None when a variable is
# unset, so a missing credential only shows up later, when a request is made.
CLIENT_ID = os.environ.get('Foursquare_ClientID')
CLIENT_SECRET = os.environ.get('Foursquare_ClientSecret')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value
radius = 500 # sent as the `radius` query parameter of the explore request below

In [25]:
# Values are listed in the order in which the URL template consumes them.
york_query_values = (
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    York_latitude,
    York_longitude,
    radius,
    LIMIT,
)
url_venues_York = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(*york_query_values)

In [26]:
results1 = requests.get(url_venues_York).json()
results1

{'meta': {'code': 200, 'requestId': '5fb9605592e219640d491b2b'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 7,
  'suggestedBounds': {'ne': {'lat': 43.7724803045, 'lng': -79.48104210959674},
   'sw': {'lat': 43.763480295499996, 'lng': -79.49348169040327}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4d3599d2eefa8cfa141d56b3',
       'name': 'MUSE Massage Spa',
       'location': {'address': '13-1290 Finch Ave. W.',
        'crossStreet': 'at Keele St.',
        'lat': 43.7656861872247,
        'lng': -79.4893183666046,
        'labeledLatLngs': [{'label': 'display',
       

#### Getting venues for Universities Neighbourhoods

In [106]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint once per neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving each neighbourhood's name and coordinates.
        They are walked together with ``zip``, so iteration stops at the
        shortest one.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example rejected
        credentials).
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Build the query from a dict so requests takes care of URL encoding.
        # Credentials and limits come from the notebook's configuration cell.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook, and raise_for_status reports an API error
        # directly instead of a KeyError while unpacking the payload.
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # A venue may carry no category; record None rather than failing
            # on categories[0].
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing columns to the constructor keeps the schema intact for an empty
    # result, where assigning .columns afterwards raises a length mismatch.
    return pd.DataFrame(venue_rows, columns=columns)

In [107]:
University_venues = getNearbyVenues(names=University_data['Neighbourhood'],
                                   latitudes=University_data['Latitude'],
                                   longitudes=University_data['Longitude']
                                  )

Northwood Park, York University
University of Toronto, Harbord


In [108]:
print(University_venues.shape)
University_venues.head()

(41, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Northwood Park, York University",43.76798,-79.487262,MUSE Massage Spa,43.765686,-79.489318,Massage Studio
1,"Northwood Park, York University",43.76798,-79.487262,Carribean Heat,43.764155,-79.490227,Caribbean Restaurant
2,"Northwood Park, York University",43.76798,-79.487262,Tim Hortons,43.764289,-79.48879,Coffee Shop
3,"Northwood Park, York University",43.76798,-79.487262,Fox & Fiddle,43.763795,-79.488497,Bar
4,"Northwood Park, York University",43.76798,-79.487262,Bad Boy Furniture - North York,43.764314,-79.486588,Furniture / Home Store


In [109]:
University_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Northwood Park, York University",7,7,7,7,7,7
"University of Toronto, Harbord",34,34,34,34,34,34


In [110]:
print('There are {} uniques categories.'.format(len(University_venues['Venue Category'].unique())))

There are 29 uniques categories.


In [111]:
# one hot encoding: one indicator column per venue category
University_onehot = pd.get_dummies(University_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category may itself be called "Neighborhood". Rename that indicator
# column so the neighbourhood names below do not overwrite it.
if 'Neighborhood' in University_onehot.columns:
    University_onehot = University_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (venue category)'})

# add the neighborhood column back as the first column; inserting at position 0
# avoids a reorder that assumes the new column ended up last
University_onehot.insert(0, 'Neighborhood', University_venues['Neighborhood'])

University_onehot.head()

Unnamed: 0,Neighborhood,Bakery,Bank,Bar,Beer Bar,Beer Store,Bookstore,Café,Caribbean Restaurant,Coffee Shop,College Arts Building,College Gym,Comfort Food Restaurant,Dessert Shop,French Restaurant,Furniture / Home Store,Italian Restaurant,Japanese Restaurant,Massage Studio,Metro Station,Miscellaneous Shop,Nightclub,Noodle House,Pub,Restaurant,Sandwich Place,Sushi Restaurant,Theater,Video Game Store,Yoga Studio
0,"Northwood Park, York University",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,"Northwood Park, York University",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Northwood Park, York University",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Northwood Park, York University",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Northwood Park, York University",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [112]:
University_onehot.shape

(41, 30)

In [113]:
Universtity_grouped = University_onehot.groupby('Neighborhood').mean().reset_index()
Universtity_grouped

Unnamed: 0,Neighborhood,Bakery,Bank,Bar,Beer Bar,Beer Store,Bookstore,Café,Caribbean Restaurant,Coffee Shop,College Arts Building,College Gym,Comfort Food Restaurant,Dessert Shop,French Restaurant,Furniture / Home Store,Italian Restaurant,Japanese Restaurant,Massage Studio,Metro Station,Miscellaneous Shop,Nightclub,Noodle House,Pub,Restaurant,Sandwich Place,Sushi Restaurant,Theater,Video Game Store,Yoga Studio
0,"Northwood Park, York University",0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"University of Toronto, Harbord",0.058824,0.029412,0.058824,0.029412,0.029412,0.088235,0.147059,0.0,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.0,0.029412,0.058824,0.0,0.0,0.0,0.029412,0.029412,0.029412,0.029412,0.058824,0.029412,0.029412,0.029412,0.029412


In [114]:
Universtity_grouped.shape

(2, 30)

#### Reviewing Top 5 Venues

In [115]:
num_top_venues = 5

# For each neighbourhood, print its most frequent venue categories.
for hood in Universtity_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Transpose the neighbourhood's single row so each category becomes a row.
    is_hood = Universtity_grouped['Neighborhood'] == hood
    temp = Universtity_grouped[is_hood].T.reset_index()
    temp.columns = ['venue','freq']

    # The first row holds the neighbourhood name, not a frequency: skip it,
    # then convert the frequencies to floats rounded to two decimals.
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )

    top_rows = temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues)
    print(top_rows)
    print('\n')

----Northwood Park, York University----
                    venue  freq
0  Furniture / Home Store  0.14
1                     Bar  0.14
2    Caribbean Restaurant  0.14
3             Coffee Shop  0.14
4      Miscellaneous Shop  0.14


----University of Toronto, Harbord----
            venue  freq
0            Café  0.15
1       Bookstore  0.09
2          Bakery  0.06
3             Bar  0.06
4  Sandwich Place  0.06




#### Obtain Data Frame with 10 most common venues

In [116]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped (in this notebook it holds the
    neighbourhood name). The remaining entries are sorted by value in
    descending order and the labels of the first ``num_top_venues`` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [117]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take their own ordinal suffix and later ranks use 'th'. An
    # explicit bounds check replaces the bare `except:`, which would also have
    # hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
University_venues_sorted = pd.DataFrame(columns=columns)
University_venues_sorted['Neighborhood'] = Universtity_grouped['Neighborhood']

# fill every row with that neighbourhood's top venue categories, best first
for ind in np.arange(Universtity_grouped.shape[0]):
    University_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Universtity_grouped.iloc[ind, :], num_top_venues)

University_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Northwood Park, York University",Furniture / Home Store,Bar,Caribbean Restaurant,Miscellaneous Shop,Metro Station,Massage Studio,Coffee Shop,Dessert Shop,Bank,Beer Bar
1,"University of Toronto, Harbord",Café,Bookstore,Bakery,Bar,Sandwich Place,Japanese Restaurant,French Restaurant,Bank,Beer Bar,Beer Store


In [118]:
# Turn the table on its side: neighbourhood names become the column headers
# and each "Nth Most Common Venue" becomes a row.
University_venues_sorted1 = University_venues_sorted.set_index('Neighborhood').T
University_venues_sorted1

Neighborhood,"Northwood Park, York University","University of Toronto, Harbord"
1st Most Common Venue,Furniture / Home Store,Café
2nd Most Common Venue,Bar,Bookstore
3rd Most Common Venue,Caribbean Restaurant,Bakery
4th Most Common Venue,Miscellaneous Shop,Bar
5th Most Common Venue,Metro Station,Sandwich Place
6th Most Common Venue,Massage Studio,Japanese Restaurant
7th Most Common Venue,Coffee Shop,French Restaurant
8th Most Common Venue,Dessert Shop,Bank
9th Most Common Venue,Bank,Beer Bar
10th Most Common Venue,Beer Bar,Beer Store


In [119]:
University_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Northwood Park, York University",43.76798,-79.487262,MUSE Massage Spa,43.765686,-79.489318,Massage Studio
1,"Northwood Park, York University",43.76798,-79.487262,Carribean Heat,43.764155,-79.490227,Caribbean Restaurant
2,"Northwood Park, York University",43.76798,-79.487262,Tim Hortons,43.764289,-79.48879,Coffee Shop
3,"Northwood Park, York University",43.76798,-79.487262,Fox & Fiddle,43.763795,-79.488497,Bar
4,"Northwood Park, York University",43.76798,-79.487262,Bad Boy Furniture - North York,43.764314,-79.486588,Furniture / Home Store


#### Clustering

In [120]:
kclusters = 2

# K-means is fitted on the frequency columns only, so leave out the
# neighbourhood name. `columns=` replaces the positional axis argument
# (`drop('Neighborhood', 1)`), which current pandas versions reject.
University_grouped_clustering = Universtity_grouped.drop(columns='Neighborhood')

# random_state is fixed so the cluster assignment is reproducible
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(University_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1])

In [123]:
# add clustering labels
# insert() raises if the column already exists, so on a re-run of this cell
# the existing column is overwritten instead.
if 'Cluster Labels' in University_venues_sorted.columns:
    University_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    University_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and ranked venue categories to each neighbourhood
# row. The key is spelled 'Neighbourhood' in University_data and
# 'Neighborhood' in University_venues_sorted.
University_merged = University_data.join(University_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

University_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3J,North York,"Northwood Park, York University",43.76798,-79.487262,0,Furniture / Home Store,Bar,Caribbean Restaurant,Miscellaneous Shop,Metro Station,Massage Studio,Coffee Shop,Dessert Shop,Bank,Beer Bar
1,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049,1,Café,Bookstore,Bakery,Bar,Sandwich Place,Japanese Restaurant,French Restaurant,Bank,Beer Bar,Beer Store


In [125]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(University_merged['Latitude'], University_merged['Longitude'], University_merged['Neighbourhood'], University_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The labels in this notebook start at 0, so index the palette with the
    # label itself; `cluster-1` only worked for label 0 by wrapping around to
    # the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters