# Question 1:
## 1.1. Notebook book created
import the basic dependencies. 

In [1]:
import numpy as np 
import pandas as pd
import requests # Library for web scraping

print('Libraries imported.')

Libraries imported.


## 1.2. Web page scraped
About the Data, Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M.
   - is a list of postal codes in Canada where the first letter is M. Postal codes beginning with M are located within the city of Toronto in the province of Ontario.
   - Scraping table from HTML using BeautifulSoup

In [2]:
# To run this, you can install BeautifulSoup
# https://pypi.python.org/pypi/beautifulsoup4

# Or download the file
# http://beautiful-soup-4
# and unzip it in the same directory as this file
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import csv

print('BeautifulSoup  & csv imported.')

BeautifulSoup  & csv imported.


In [3]:
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

print('SSL certificate errors ignored.')

SSL certificate errors ignored.


In [7]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
#soup ready
table = soup.find('table',{'class':'wikitable sortable'})
#table
table_rows = table.find_all('tr')
#table_rows
data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])


## 1.3. Data transformed into pandas dataframe

In [10]:
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]  # to filter out bad rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 1 to 180
Data columns (total 3 columns):
PostalCode       180 non-null object
Borough          180 non-null object
Neighbourhood    180 non-null object
dtypes: object(3)
memory usage: 5.6+ KB


In [9]:
df.shape

(180, 3)

## 1.4. Dataframe cleaned and notebook annotate
Only process the cells that have an assigned borough, we can ignore cells with 'Not assigned' boroughs, like in rows 1 & 2

In [11]:
df = df[~df['PostalCode'].isnull()]  # to filter out bad rows
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)
df1 = df.reset_index()
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 4 columns):
index            103 non-null int64
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: int64(1), object(3)
memory usage: 3.3+ KB


In [12]:
df1.shape

(103, 4)

In [15]:
df1.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,6,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


More than one neighborhood can exist in one postal code area, M5A is listed twice and has two neighborhoods Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma using **groupby**

In [13]:
df2= df1.groupby('PostalCode').agg(lambda x: ','.join(x))
df2.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [14]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103 entries, M1B to M9W
Data columns (total 2 columns):
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(2)
memory usage: 2.4+ KB


In [16]:
df2.shape

(103, 2)

There are also cells that have an assigned neighbouhoods,like M7A, lets assign their boroughs as their neighbourhood, as follows:

In [96]:
df2.loc[df2['Neighbourhood']=="Not assigned",'Neighbourhood']=df2.loc[df2['Neighbourhood']=="Not assigned",'Borough']
df3 = df2.reset_index()

Now we can remove the duplicate boroughts as follows:

In [97]:
df3['Borough']= df3['Borough'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")

In [98]:
df3.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [99]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [100]:
df3.shape

(103, 3)

# Question 2:
## Used the Geocoder Package to get the coordinates of neighborhoods

In [76]:
from  geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my-application")

In [114]:
df4=df3.drop('Neighbourhood', axis=1).join(df3['Neighbourhood'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('Neighbourhood'))
df4=df4.reset_index(drop=True)

In [115]:
df4['Address']=df4['PostalCode'] + ',' + df4['Borough'] + ','+ df4['Neighbourhood']
df4['Coordinates'] =df4['Address'].apply(geolocator.geocode)
df4.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Address,Coordinates
0,M1B,Scarborough,Malvern,"M1B,Scarborough,Malvern",
1,M1B,Scarborough,Rouge,"M1B,Scarborough, Rouge","(Rouge, Scarborough—Rouge Park, Scarborough, T..."
2,M1C,Scarborough,Rouge Hill,"M1C,Scarborough,Rouge Hill","(Rouge Hill, Scarborough—Rouge Park, Scarborou..."
3,M1C,Scarborough,Port Union,"M1C,Scarborough, Port Union","(Port Union, Scarborough—Rouge Park, Scarborou..."
4,M1C,Scarborough,Highland Creek,"M1C,Scarborough, Highland Creek","(Highland Creek, Scarborough—Rouge Park, Scarb..."


In [117]:
df4['Latitude']=df4['Coordinates'].apply(lambda x: x.latitude if x !=None else None)
df4['Longitude']=df4['Coordinates'].apply(lambda x: x.longitude if x !=None else None)

In [119]:
columnlist = ['PostalCode',
 'Borough',
 'Neighbourhood',
 'Latitude',
 'Longitude']
df4=df4[columnlist]
df4

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,,
1,M1B,Scarborough,Rouge,43.804930,-79.165837
2,M1C,Scarborough,Rouge Hill,43.795019,-79.135104
3,M1C,Scarborough,Port Union,43.775504,-79.134976
4,M1C,Scarborough,Highland Creek,43.790117,-79.173334
...,...,...,...,...,...
204,M9V,Etobicoke,Mount Olive,,
205,M9V,Etobicoke,Beaumond Heights,,
206,M9V,Etobicoke,Thistletown,,
207,M9V,Etobicoke,Albion Gardens,,


In [138]:
Neighbourhoods=df4[df4['Latitude'].notna()].reset_index(drop=True)
Neighbourhoods

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.80493,-79.165837
1,M1C,Scarborough,Rouge Hill,43.795019,-79.135104
2,M1C,Scarborough,Port Union,43.775504,-79.134976
3,M1C,Scarborough,Highland Creek,43.790117,-79.173334
4,M1G,Scarborough,Woburn,43.759824,-79.225291
5,M1W,Scarborough,Steeles West,43.816178,-79.314538
6,M1W,Scarborough,L'Amoreaux West,43.799003,-79.305967
7,M6N,York,Runnymede,43.66558,-79.482108
8,M6N,York,The Junction North,42.928672,-78.026122
9,M9C,Etobicoke,Eringate,43.662273,-79.576516


In [149]:
Neighbourhoods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
PostalCode       17 non-null object
Borough          17 non-null object
Neighbourhood    17 non-null object
Latitude         17 non-null float64
Longitude        17 non-null float64
dtypes: float64(2), object(3)
memory usage: 808.0+ bytes


# Question 3:
## Explore and cluster the neighborhoods in Toronto. 

In [140]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [146]:
address = 'Toronto, ON, Canada'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.6534817, -79.3839347.


In [155]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Neighbourhoods['Latitude'], Neighbourhoods['Longitude'], Neighbourhoods['Borough'], Neighbourhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Let's simplify the above map and segment and cluster only the neighborhoods in Scarborough. 
### So let's slice the original dataframe and create a new dataframe of the Scarborough data.

In [158]:
scarborough_data = Neighbourhoods[Neighbourhoods['Borough'] == 'Scarborough'].reset_index(drop=True)
scarborough_data.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.80493,-79.165837
1,M1C,Scarborough,Rouge Hill,43.795019,-79.135104
2,M1C,Scarborough,Port Union,43.775504,-79.134976
3,M1C,Scarborough,Highland Creek,43.790117,-79.173334
4,M1G,Scarborough,Woburn,43.759824,-79.225291


In [266]:
address = 'Scarborough, Toronto'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.773077, -79.257774.


#### Visualizat Scarborough the neighborhoods.

In [268]:
# create map of Manhattan using latitude and longitude values
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_scarborough)  
    
map_scarborough


#### Now, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.
#### Define Foursquare Credentials and Version

In [269]:
CLIENT_ID = 'NZQGQHN3EFD4MUMQQNX5EJ0SMFJDG2F5FCOSMMNESYZEJXF3' # your Foursquare ID
CLIENT_SECRET = 'H4Y1RK5PSVGWRAPSI12A1NNKP004TQZHSKEUVLEZZKZHNVHB' # your Foursquare Secret
VERSION = '20180604'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NZQGQHN3EFD4MUMQQNX5EJ0SMFJDG2F5FCOSMMNESYZEJXF3
CLIENT_SECRET:H4Y1RK5PSVGWRAPSI12A1NNKP004TQZHSKEUVLEZZKZHNVHB


## 1.Let's explore the first neighborhood in our dataframe

In [270]:
scarborough_data.loc[0, 'Neighborhood']

' Rouge'

In [271]:
neighborhood_latitude = scarborough_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = scarborough_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = scarborough_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of  Rouge are 43.8049304, -79.1658374.


#### Now, let's get the top 100 venues that are in Rouge within a radius of 1000 meters.

In [272]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=NZQGQHN3EFD4MUMQQNX5EJ0SMFJDG2F5FCOSMMNESYZEJXF3&client_secret=H4Y1RK5PSVGWRAPSI12A1NNKP004TQZHSKEUVLEZZKZHNVHB&ll=43.8049304,-79.1658374&v=20180604&radius=1000&limit=100'

In [173]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5eb71e776d8c56001b8cdaf9'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Rouge',
  'headerFullLocation': 'Rouge, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 11,
  'suggestedBounds': {'ne': {'lat': 43.813930409000015,
    'lng': -79.15339012734938},
   'sw': {'lat': 43.79593039099999, 'lng': -79.17828467265062}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c77d56293faa093bbfcf1fb',
       'name': 'Glen Rouge Campground',
       'location': {'address': '7450 Kingston Rd.',
        'lat': 43.80316668151155,
        'lng': -79.15518309247824,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.8

we know that all the information is in the items key. Before we proceed, let's borrow the get_category_type function from the Foursquare lab.

In [273]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [274]:
# Now we are ready to clean the json and structure it into a pandas dataframe.

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Rouge Campground,Campground,43.803167,-79.155183
1,Dean Park,Park,43.804364,-79.169159
2,Petro-Canada,Gas Station,43.807831,-79.171431
3,Paul's Breakfast & Burgers,Fast Food Restaurant,43.803835,-79.169825
4,Hwy 401 at Meadowvale,Intersection,43.805066,-79.173271


In [275]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

11 venues were returned by Foursquare.


## 2. Explore Neighborhoods in Scarborough
Let's create a function to repeat the same process to all the neighborhoods in Scarborough

In [276]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [277]:
# run the above function on each neighborhood and create a new dataframe called scarborough_venues.

scarborough_venues = getNearbyVenues(names=scarborough_data['Neighborhood'],
                                   latitudes=scarborough_data['Latitude'],
                                   longitudes=scarborough_data['Longitude']
                                  )


 Rouge
Rouge Hill
 Port Union
 Highland Creek
Woburn
Steeles West
 L'Amoreaux West


In [278]:
print(scarborough_venues.shape)
scarborough_venues.head()

(356, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rouge,43.80493,-79.165837,Glen Rouge Campground,43.803167,-79.155183,Campground
1,Rouge,43.80493,-79.165837,"Lamanna's Bakery, Cafe & Fine Foods",43.797971,-79.148432,Bakery
2,Rouge,43.80493,-79.165837,Dean Park,43.804364,-79.169159,Park
3,Rouge,43.80493,-79.165837,Shoppers Drug Mart,43.797156,-79.151061,Pharmacy
4,Rouge,43.80493,-79.165837,Rouge National Urban Park,43.818747,-79.170414,National Park


Let's check how many venues were returned for each neighborhood

In [279]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Highland Creek,20,20,20,20,20,20
L'Amoreaux West,69,69,69,69,69,69
Port Union,18,18,18,18,18,18
Rouge,29,29,29,29,29,29
Rouge Hill,53,53,53,53,53,53
Steeles West,100,100,100,100,100,100
Woburn,67,67,67,67,67,67


Let's find out how many unique categories can be curated from all the returned venues

In [280]:
print('There are {} uniques categories.'.format(len(scarborough_venues['Venue Category'].unique())))

There are 92 uniques categories.


## 3. Analyze Each Neighborhood

In [281]:
# # one hot encoding
scarborough_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
scarborough_onehot=scarborough_onehot.drop(columns=['Neighborhood'])
scarborough_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [scarborough_onehot.columns[-1]] + list(scarborough_onehot.columns[:-1])
scarborough_onehot = scarborough_onehot[fixed_columns]
scarborough_onehot.head()

Unnamed: 0,Neighborhood,Art Gallery,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bank,Baseball Field,Beach,Beer Store,...,Tea Room,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Xinjiang Restaurant,Zoo Exhibit
0,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rouge,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [282]:
scarborough_grouped = scarborough_onehot.groupby('Neighborhood').mean().reset_index()
scarborough_grouped

Unnamed: 0,Neighborhood,Art Gallery,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bank,Baseball Field,Beach,Beer Store,...,Tea Room,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Xinjiang Restaurant,Zoo Exhibit
0,Highland Creek,0.0,0.0,0.1,0.0,0.0,0.05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,L'Amoreaux West,0.0,0.0,0.0,0.014493,0.057971,0.028986,0.014493,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.014493,0.057971,0.0,0.0
2,Port Union,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.055556,...,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0
3,Rouge,0.0,0.0,0.0,0.0,0.034483,0.034483,0.0,0.0,0.0,...,0.0,0.0,0.0,0.103448,0.0,0.0,0.0,0.0,0.0,0.068966
4,Rouge Hill,0.0,0.0,0.0,0.0,0.018868,0.037736,0.0,0.018868,0.018868,...,0.0,0.0,0.0,0.037736,0.018868,0.0,0.0,0.0,0.0,0.018868
5,Steeles West,0.0,0.03,0.0,0.01,0.03,0.03,0.01,0.0,0.0,...,0.03,0.0,0.0,0.0,0.0,0.02,0.0,0.03,0.01,0.0
6,Woburn,0.014925,0.0,0.014925,0.0,0.0,0.059701,0.0,0.0,0.029851,...,0.0,0.014925,0.014925,0.0,0.0,0.0,0.0,0.014925,0.014925,0.0


In [283]:
scarborough_grouped.shape

(7, 92)

### Let's print each neighborhood along with the top 5 most common venues

In [284]:
num_top_venues = 5

for hood in scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = scarborough_grouped[scarborough_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Highland Creek----
                venue  freq
0         Gas Station  0.15
1        Burger Joint  0.10
2  Athletics & Sports  0.10
3         Coffee Shop  0.10
4                 Pub  0.05


---- L'Amoreaux West----
                   venue  freq
0     Chinese Restaurant  0.17
1            Coffee Shop  0.10
2                 Bakery  0.06
3  Vietnamese Restaurant  0.06
4           Dessert Shop  0.06


---- Port Union----
               venue  freq
0               Park  0.11
1          Pet Store  0.06
2        Pizza Place  0.06
3  Food & Drink Shop  0.06
4        Supermarket  0.06


---- Rouge----
         venue  freq
0        Trail  0.10
1  Coffee Shop  0.10
2  Zoo Exhibit  0.07
3         Park  0.07
4  Gas Station  0.07


----Rouge Hill----
            venue  freq
0            Park  0.08
1     Coffee Shop  0.08
2             Pub  0.06
3   Shopping Mall  0.04
4  Sandwich Place  0.04


----Steeles West----
                  venue  freq
0    Chinese Restaurant  0.19
1           Coffee S

#### Let's put that into a pandas dataframe
#### First, let's write a function to sort the venues in descending order.

In [285]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [286]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

for ind in np.arange(scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Highland Creek,Gas Station,Burger Joint,Coffee Shop,Athletics & Sports,Italian Restaurant,Pharmacy,Pizza Place,Pub,Fish & Chips Shop,Breakfast Spot
1,L'Amoreaux West,Chinese Restaurant,Coffee Shop,Dessert Shop,Bakery,Vietnamese Restaurant,Korean Restaurant,Park,Bubble Tea Shop,Bank,Gas Station
2,Port Union,Park,Cosmetics Shop,Pizza Place,Pharmacy,Sandwich Place,Japanese Restaurant,Pet Store,Supermarket,Fast Food Restaurant,Beer Store
3,Rouge,Trail,Coffee Shop,Zoo Exhibit,Park,Gas Station,Bakery,Ice Cream Shop,Pharmacy,Campground,Pizza Place
4,Rouge Hill,Coffee Shop,Park,Pub,Fast Food Restaurant,Breakfast Spot,Gas Station,Trail,Pharmacy,Bank,Pizza Place


## 4. Cluster Neighborhoods
#### Run k-means to cluster the neighborhood into 5 clusters.

In [287]:
# set number of clusters
kclusters = 3

scarborough_grouped_clustering = scarborough_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 2, 2, 2, 0, 2], dtype=int32)

In [288]:
# Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,Highland Creek,Gas Station,Burger Joint,Coffee Shop,Athletics & Sports,Italian Restaurant,Pharmacy,Pizza Place,Pub,Fish & Chips Shop,Breakfast Spot
1,0,L'Amoreaux West,Chinese Restaurant,Coffee Shop,Dessert Shop,Bakery,Vietnamese Restaurant,Korean Restaurant,Park,Bubble Tea Shop,Bank,Gas Station
2,2,Port Union,Park,Cosmetics Shop,Pizza Place,Pharmacy,Sandwich Place,Japanese Restaurant,Pet Store,Supermarket,Fast Food Restaurant,Beer Store
3,2,Rouge,Trail,Coffee Shop,Zoo Exhibit,Park,Gas Station,Bakery,Ice Cream Shop,Pharmacy,Campground,Pizza Place
4,2,Rouge Hill,Coffee Shop,Park,Pub,Fast Food Restaurant,Breakfast Spot,Gas Station,Trail,Pharmacy,Bank,Pizza Place
5,0,Steeles West,Chinese Restaurant,Coffee Shop,Sandwich Place,Hong Kong Restaurant,Korean Restaurant,Bubble Tea Shop,Vietnamese Restaurant,Asian Restaurant,Fast Food Restaurant,Bakery
6,2,Woburn,Coffee Shop,Fast Food Restaurant,Pizza Place,Bank,Sandwich Place,Chinese Restaurant,Pharmacy,Park,Beer Store,Big Box Store


In [289]:
scarborough_merged = scarborough_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
scarborough_merged = scarborough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scarborough_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Rouge,43.80493,-79.165837,2,Trail,Coffee Shop,Zoo Exhibit,Park,Gas Station,Bakery,Ice Cream Shop,Pharmacy,Campground,Pizza Place
1,M1C,Scarborough,Rouge Hill,43.795019,-79.135104,2,Coffee Shop,Park,Pub,Fast Food Restaurant,Breakfast Spot,Gas Station,Trail,Pharmacy,Bank,Pizza Place
2,M1C,Scarborough,Port Union,43.775504,-79.134976,2,Park,Cosmetics Shop,Pizza Place,Pharmacy,Sandwich Place,Japanese Restaurant,Pet Store,Supermarket,Fast Food Restaurant,Beer Store
3,M1C,Scarborough,Highland Creek,43.790117,-79.173334,1,Gas Station,Burger Joint,Coffee Shop,Athletics & Sports,Italian Restaurant,Pharmacy,Pizza Place,Pub,Fish & Chips Shop,Breakfast Spot
4,M1G,Scarborough,Woburn,43.759824,-79.225291,2,Coffee Shop,Fast Food Restaurant,Pizza Place,Bank,Sandwich Place,Chinese Restaurant,Pharmacy,Park,Beer Store,Big Box Store
5,M1W,Scarborough,Steeles West,43.816178,-79.314538,0,Chinese Restaurant,Coffee Shop,Sandwich Place,Hong Kong Restaurant,Korean Restaurant,Bubble Tea Shop,Vietnamese Restaurant,Asian Restaurant,Fast Food Restaurant,Bakery
6,M1W,Scarborough,L'Amoreaux West,43.799003,-79.305967,0,Chinese Restaurant,Coffee Shop,Dessert Shop,Bakery,Vietnamese Restaurant,Korean Restaurant,Park,Bubble Tea Shop,Bank,Gas Station


#### Finally, let's visualize the resulting clusters

In [290]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarborough_merged['Latitude'], scarborough_merged['Longitude'], scarborough_merged['Neighborhood'], scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters
#### examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, assign a name to each cluster. 

In [291]:

scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 0, scarborough_merged.columns[[2] + list(range(5, scarborough_merged.shape[1]))]]


Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Steeles West,0,Chinese Restaurant,Coffee Shop,Sandwich Place,Hong Kong Restaurant,Korean Restaurant,Bubble Tea Shop,Vietnamese Restaurant,Asian Restaurant,Fast Food Restaurant,Bakery
6,L'Amoreaux West,0,Chinese Restaurant,Coffee Shop,Dessert Shop,Bakery,Vietnamese Restaurant,Korean Restaurant,Park,Bubble Tea Shop,Bank,Gas Station


In [292]:
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 1, scarborough_merged.columns[[2] + list(range(5, scarborough_merged.shape[1]))]]


Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Highland Creek,1,Gas Station,Burger Joint,Coffee Shop,Athletics & Sports,Italian Restaurant,Pharmacy,Pizza Place,Pub,Fish & Chips Shop,Breakfast Spot


In [293]:
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == 2, scarborough_merged.columns[[2] + list(range(5, scarborough_merged.shape[1]))]]


Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Rouge,2,Trail,Coffee Shop,Zoo Exhibit,Park,Gas Station,Bakery,Ice Cream Shop,Pharmacy,Campground,Pizza Place
1,Rouge Hill,2,Coffee Shop,Park,Pub,Fast Food Restaurant,Breakfast Spot,Gas Station,Trail,Pharmacy,Bank,Pizza Place
2,Port Union,2,Park,Cosmetics Shop,Pizza Place,Pharmacy,Sandwich Place,Japanese Restaurant,Pet Store,Supermarket,Fast Food Restaurant,Beer Store
4,Woburn,2,Coffee Shop,Fast Food Restaurant,Pizza Place,Bank,Sandwich Place,Chinese Restaurant,Pharmacy,Park,Beer Store,Big Box Store


 #### Based on the defining categories, assign a name to each cluster.  
   - **0.Food area:** Steeles West,L'Amoreaux West
   - **1.Utility area:** Highland Creek
   - **2.Recration area:** Rouge,Rouge Hill,Port Union,Woburn