# Capstone Project for Coursera IBM Data Science Professional Certificate

### Install the geocoder library

In [2]:
!conda install -c conda-forge geocoder 

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge

orderedset-2.0 100% |################################| Time: 0:00:00  48.44 MB/s
ratelim-0.1.6- 100% |################################| Time: 0:00:00  12.68 MB/s
geocoder-1.38. 100% |################################| Time: 0:00:00  43.99 MB/s


### import the libraries that are needed for the assignment

In [1]:
!conda install -c conda-forge folium=0.5.0 --yes # install folium library for mapping

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  52.20 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  34.65 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  37.85 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  47.49 MB/s


In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

import numpy as np # library to handle data in a vectorized manner

### Scrape the wikipedia webpage and parse the data using beautifulsoup. Records where borough was not assigned, were ommitted from the final dataframe.
### For records where neighborhood as not assigned, the borough name was assigned to the neighhorhood name per assignment instructions

In [4]:
#get data from wikipedia page
source=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

# define the dataframe columns
column_names = ['Postcode','Borough', 'Neighborhood'] 

# instantiate the dataframe
Canada_neighborhoods = pd.DataFrame(columns=column_names) 

#loop through the table rows and columns to pull the data
table_body = table.find('tbody')
rows = table_body.find_all('tr')
column_count = 0
m_postcode = ""
m_borough = ""
m_neighborhood = ""

for tr in rows:
    cols = tr.find_all('td')
    for td in cols:
        if column_count == 0:
            m_postcode = td.text
            column_count = column_count + 1
        elif column_count == 1:
            m_borough = td.text
            column_count = column_count + 1
        else:
            m_neighborhood = td.text
            column_count = 0
    
       
    # replace neighborhood name with borough name if neighborhood name is not assigned
    if m_neighborhood == "Not assigned\n":
        m_neighborhood = m_borough
   
    # ignore records that have borough not assigned
    if m_postcode == "":
        pass
    else:
        if m_borough == "Not assigned":
            pass
        else:
            Canada_neighborhoods = Canada_neighborhoods.append({'Postcode': m_postcode,
                                                 'Borough': m_borough,
                                                'Neighborhood': m_neighborhood}, ignore_index=True)

#the neighborhood name came with a new line character '\n' so I had to clean it with this statement
Canada_neighborhoods['Neighborhood'] = Canada_neighborhoods['Neighborhood'].map(lambda x: x.rstrip('\n')) 
Canada_neighborhoods.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### I made the assumption that the same boroughs will have the same postalcode, so I grouped by postal code and borough, and then concatencated the neighborhood name

In [5]:
New_Canada_neighborhoods = Canada_neighborhoods.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
New_Canada_neighborhoods.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
New_Canada_neighborhoods.shape

(103, 3)

## the geocoder is unable to get coordinates from google. It gets stuck in the while loop because it is not returning any values. so I opted to use the csv file.
import geocoder # import geocoder

#### initialize your variable to None

lat_lng_coords = None
postcode_lst = []
latitude_lst = []
longitude_lst = []

postcode_lst = New_Canada_neighborhoods['Postcode']
for postal_code in postcode_lst:
    # loop until you get the coordinates
    #lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        print(lat_lng_coords)

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    latitude_lst.append(latitude)
    longitude_lst.append(longitude)

#print(latitude_lst)

### Using the csv file instead

In [7]:
file_source = "http://cocl.us/Geospatial_data"
df_geospatial = pd.read_csv(file_source)
df_geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### merge our original data from wikipedia with the geospatial data from the csv file

In [8]:
Canada_neighborhoods_final = pd.merge(left=New_Canada_neighborhoods, right=df_geospatial, how='left', left_on='Postcode', right_on='Postal Code')
Canada_neighborhoods_final.drop(['Postal Code'], axis=1, inplace=True)
Canada_neighborhoods_final.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### check the final size of new table

In [9]:
Canada_neighborhoods_final.shape

(103, 5)

Create a map of Toronto and superimpose markers for all the neighborhoods obtained from the table above

In [10]:
# create map of Toronto, Canada using latitude and longitude values
latitude = 43.653908
longitude = -79.384293

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Canada_neighborhoods_final['Latitude'], Canada_neighborhoods_final['Longitude'], Canada_neighborhoods_final['Borough'], Canada_neighborhoods_final['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Select data from main dataset that have boroughs with Toronto as part of their name. This reduces the number of records that will be returned from Foursquare and makes the graphs less cluttered.

In [11]:
Toronto_data = Canada_neighborhoods_final[Canada_neighborhoods_final['Borough'].str.contains('Toronto', na=False)].reset_index(drop=True)
Toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [12]:
Toronto_data.shape

(38, 5)

Create a new map, that superimposes markers for only boroughs with Toronto as part of their names

In [13]:
# create map of Toronto using latitude and longitude values
map_Toronto_sub = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map only in areas that have Toronto in the name of the borough
for lat, lng, label in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto_sub)  
    
map_Toronto_sub

Set my client ID and client Secret for Foursquare. My Client ID and Client Secret are hidden in the next cell

In [14]:
{"tags":["removecell",]}
CLIENT_ID = 'JYTYMJLVZA51NAVNYFBKYDHJ3LBDJAZQ4VREN20QYY2VQFAZ' # your Foursquare ID
CLIENT_SECRET = 'TZ5V11SELHDESWQ5X1WH5550FTO3ZDHDOZISUZXQ1OOWBLVM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JYTYMJLVZA51NAVNYFBKYDHJ3LBDJAZQ4VREN20QYY2VQFAZ
CLIENT_SECRET:TZ5V11SELHDESWQ5X1WH5550FTO3ZDHDOZISUZXQ1OOWBLVM


Define a function to get nearby venues from foursquare. I am limiting the results output to 100

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now call the function to get the nearby venues and save it in Toronto_venues

In [16]:
Toronto_venues = getNearbyVenues(names=Toronto_data['Neighborhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude']
                                  )
print(Toronto_venues.shape)
Toronto_venues.head()

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


Now lets get a count of the nearby venues returned per neighborhood

In [17]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,58,58,58,58,58,58
"Brockton, Exhibition Place, Parkdale Village",22,22,22,22,22,22
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town",43,43,43,43,43,43
Central Bay Street,85,85,85,85,85,85
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,86,86,86,86,86,86


In [18]:
# print out the number of unique categories per venue
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 240 uniques categories.


### Analyze each neighborhood

In [19]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()
#Toronto_onehot.shape

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [33]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.011765,0.0,0.0,0.011765,0.0,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05,0.0,0.04,0.01,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.023256,0.0,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011628,0.011628,0.0,0.0,0.011628,0.0


Use the function next to sort the venues in descending order

In [34]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create a new dataframe, and use the function above to sort the top 10 venues and save them in the new dataframe

In [35]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Thai Restaurant,Café,Steakhouse,Bar,Hotel,American Restaurant,Bakery,Sushi Restaurant,Asian Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Bakery,Steakhouse,Seafood Restaurant,Farmers Market,Restaurant,Café,Pub
2,"Brockton, Exhibition Place, Parkdale Village",Breakfast Spot,Café,Coffee Shop,Yoga Studio,Bar,Burrito Place,Restaurant,Caribbean Restaurant,Climbing Gym,Pet Store
3,Business Reply Mail Processing Centre 969 Eastern,Auto Workshop,Garden,Brewery,Farmers Market,Spa,Light Rail Station,Fast Food Restaurant,Burrito Place,Restaurant,Recording Studio
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport Service,Plane,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Gate,Airport


## Cluster the Neighborhoods

In [36]:
# set number of clusters
kclusters = 4

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

Create a new dataframe containing the clusters labels and also the 10 most commone venues

In [37]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = Toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Health Food Store,Coffee Shop,Pub,Dim Sum Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,3,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Bakery,Sports Bar,Spa,Juice Bar,Brewery
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,3,Sandwich Place,Pet Store,Brewery,Food & Drink Shop,Italian Restaurant,Steakhouse,Fish & Chips Shop,Fast Food Restaurant,Liquor Store,Burger Joint
3,M4M,East Toronto,Studio District,43.659526,-79.340923,3,Café,Coffee Shop,Italian Restaurant,Bakery,Gastropub,American Restaurant,Sandwich Place,Stationery Store,Juice Bar,Fish Market
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Gym / Fitness Center,Park,Swim School,Bus Line,Women's Store,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


Create a new map showing the clusters in color.

In [38]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine the clusters created
I run this for five clusters first and then for 4 clusters. I also noticed that when I run the clustering algorithm multiple times, the constituents of the clusters change each time. This will suggest that these clusters may not be very well defined and the characteristics that separate them may not be distinct enough.

#### Cluster 0:
This cluster could be mall space. It features stores and restaurants and the 3rd most common venue is a bus line, presumably to ferry people to and from the mall 

In [39]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,0,Gym / Fitness Center,Park,Swim School,Bus Line,Women's Store,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
10,Downtown Toronto,0,Park,Trail,Playground,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Women's Store


#### Cluster 1:
This could be a shopping center characterized by shops and restaurants.

In [40]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,1,Home Service,Garden,Women's Store,Diner,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


#### Cluster 2:
This seems to be a neighborhood close to where people live because the most common venue are parks and playgrounds. It might also be a very culturally diverse neighborhood featuring restaurants dominated by international cuisines

In [41]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,2,Playground,Gym,Dim Sum Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


#### Cluster 3:
The most common venues for this cluster seems to be coffee shops/cafe. This cluster might represent a neighborhood with a lot of business's, so people might drop in the morning on their way to work to either grab a coffee or a quick breakfast

In [42]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,3,Health Food Store,Coffee Shop,Pub,Dim Sum Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,East Toronto,3,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Bakery,Sports Bar,Spa,Juice Bar,Brewery
2,East Toronto,3,Sandwich Place,Pet Store,Brewery,Food & Drink Shop,Italian Restaurant,Steakhouse,Fish & Chips Shop,Fast Food Restaurant,Liquor Store,Burger Joint
3,East Toronto,3,Café,Coffee Shop,Italian Restaurant,Bakery,Gastropub,American Restaurant,Sandwich Place,Stationery Store,Juice Bar,Fish Market
5,Central Toronto,3,Hotel,Breakfast Spot,Burger Joint,Dance Studio,Food & Drink Shop,Clothing Store,Sandwich Place,Gym,Park,Electronics Store
6,Central Toronto,3,Sporting Goods Shop,Coffee Shop,Yoga Studio,Diner,Dessert Shop,Italian Restaurant,Rental Car Location,Sandwich Place,Mexican Restaurant,Chinese Restaurant
7,Central Toronto,3,Sandwich Place,Dessert Shop,Restaurant,Coffee Shop,Café,Pizza Place,Italian Restaurant,Sushi Restaurant,Gourmet Shop,Fried Chicken Joint
9,Central Toronto,3,Coffee Shop,Pub,Pizza Place,American Restaurant,Light Rail Station,Medical Center,Sports Bar,Supermarket,Sushi Restaurant,Fried Chicken Joint
11,Downtown Toronto,3,Coffee Shop,Restaurant,Café,Bakery,Pizza Place,Italian Restaurant,Pub,Gift Shop,Beer Store,Bank
12,Downtown Toronto,3,Japanese Restaurant,Coffee Shop,Restaurant,Sushi Restaurant,Burger Joint,Gay Bar,Pub,Bubble Tea Shop,Gym,Yoga Studio
