# Assignment for Coursera_Capstone

### Import Necessary libraries

In [3]:
## import necessary libraries
import pandas as pd
import numpy as np
import sklearn 
import matplotlib as mpl
import requests
from bs4 import BeautifulSoup

### Scrape datasets from online
We need to present the datasets in the format of dataframe for further operations

In [4]:
## scrape datasets from website 
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
Toronto_data_raw = pd.read_html(str(table))
Toronto_data1 = Toronto_data_raw[0]
Toronto_data1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Set the column names

In [5]:
## clean the dataset and set it into a dataframe format
Toronto_data = Toronto_data1.drop([0],axis = 0)
Toronto_data = Toronto_data.reset_index()
Toronto_data.columns = ['index','Postcode', 'Borough','Neighborhood']
Toronto_data = Toronto_data.drop(['index'], axis = 1)
Toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park


### Clean data with unassigned borough values

In [6]:
## Ignore rows with unassigned borough values 
Toronto_data = Toronto_data[Toronto_data['Borough'] != 'Not assigned']

Toronto_data.shape

(211, 3)

### Format data with same postcode but different neighborhoods

In [7]:
## use another dataframe for the operations
Toronto_try = Toronto_data

## Test if there are repeating rows in the dataframe
for i in range(211):
    for j in range(211):
        if i != j and Toronto_try.iloc[i]['Postcode'] ==  Toronto_try.iloc[j]['Postcode'] and Toronto_try.iloc[i]['Neighborhood'] == Toronto_try.iloc[j]['Neighborhood']:
            print (i,j)

In [9]:
## combine the neighborhoods for the same postcode value
for i in range(1, 211):
    if Toronto_try.iloc[i]['Postcode'] ==  Toronto_try.iloc[i-1]['Postcode'] and Toronto_try.iloc[i]['Neighborhood'] != Toronto_try.iloc[i-1]['Neighborhood']:
        Toronto_try.iloc[i]['Neighborhood'] = str(Toronto_try.iloc[i-1]['Neighborhood']) + ', ' + Toronto_try.iloc[i]['Neighborhood'] 

## reset index 
Toronto_try1 = Toronto_try.reset_index()
Toronto_try1 = Toronto_try1.drop(['index'], axis = 1)

## drop redundant rows
redundant_rows = []
for i in range(1, 211):
    if Toronto_try1.iloc[i]['Postcode'] == Toronto_try1.iloc[i-1]['Postcode']:
        redundant_rows.append(i-1) 

## reset index for the new dataframe again
Toronto_try2 = Toronto_try1.drop(redundant_rows)
Toronto_try2 = Toronto_try2.reset_index()  
Toronto_try2 = Toronto_try2.drop(['index'], axis = 1)
Toronto_try2.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Format missing data of neighborhood with borough value

In [10]:
## use row iterations to fill in the missing data of neighborhood
for index, row in Toronto_try2.iterrows():
    if row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']
## use another explainative dataframe name to represent the final data 
Toronto_final = Toronto_try2
Toronto_final.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Shape of the dataframe 

In [11]:
Toronto_final.shape

(103, 3)

### Obtain Postcode Data (Second Qn Week 3)
Since the code involving geocoder takes too long to run and frequently encounters errors, we are extracting postcode data from the csv file provided

In [12]:
## Extract postcode data from csv files
postcode = pd.read_csv("http://cocl.us/Geospatial_data")
postcode.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge Toronto_final data and postcode data into one dataframe

In [13]:
## Test on the shape of the postal code data
## make sure it fits the original Toronto_final dataframe
postcode.shape

(103, 3)

In [14]:
## Add two columns ('Latitide', 'Longitude') to the dataframe 
Toronto_final.insert(3, "Latitude", np.zeros(103), True)
Toronto_final.insert(4, 'Longitude', np.zeros(103), True)
Toronto_final.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,0.0,0.0
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",0.0,0.0
3,M6A,North York,"Lawrence Heights, Lawrence Manor",0.0,0.0
4,M7A,Queen's Park,Queen's Park,0.0,0.0


In [15]:
## Use iterations to combine data from two dataframes
for index, row in Toronto_final.iterrows():
    for j in range(103):
        if row['Postcode'] == postcode.iloc[j]['Postal Code']:
            Toronto_final.at[index, 'Latitude'] = postcode.iloc[j]['Latitude']
            Toronto_final.at[index, 'Longitude'] = postcode.iloc[j]['Longitude']
Toronto_final.head(10) 

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## Neighborhood Cluster Analysis (Third Qn Week 4)

### Select Appropriate Datasets for Analysis
For the convenience of analysis, we will analyze neighborhoods of boroughs that contain "Toronto": Central/Downtown/East/West Toronto

In [16]:
## check different types of boroughs
set(Toronto_final['Borough'])

{'Central Toronto',
 'Downtown Toronto',
 'East Toronto',
 'East York',
 'Etobicoke',
 'Mississauga',
 'North York',
 "Queen's Park",
 'Scarborough',
 'West Toronto',
 'York'}

In [17]:
## Filter out neighborhoods in C/E/W/D Toronto
Toronto_selected = Toronto_final[Toronto_final['Borough'].str.contains("Toronto")]
Toronto_selected = Toronto_selected.reset_index()
Toronto_selected = Toronto_selected.drop(['index'], axis = 1)
Toronto_selected.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


### Visualize the map of Toronto
We will visualize the map of Toronto with data of Toronto_selected

In [18]:
## get latitude & longitude values for Toronto
import geopy
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "Toronto_explorer")
location = geolocator.geocode("Toronto")
lat = location.latitude
lng = location.longitude

In [19]:
## Install relevant packages for visualization
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

In [54]:
## plot the map for Toronto with latitude & longitude values
import folium
map_trt = folium.Map(location = [lat, lng], zoom_start = 12)

## add markers to map
for lati, lngi, borough, postcode in zip(Toronto_selected['Latitude'], Toronto_selected['Longitude'], Toronto_selected['Borough'], Toronto_selected['Postcode']):
    label = '{}, {}'.format(borough, postcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lati, lngi],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.6,
        parse_html = False
    ).add_to(map_trt)
    
map_trt

### Analysis for one Neighborhood as an Example -- M5A
We will select M5A as an example to illustrate how we obtain the information of different venues in a neighborhood and present the information in a dataframe

In [21]:
## import necessary libraries
import requests

## Obtain the coordinates for M5A
M5A_LA = Toronto_selected.iloc[0]['Latitude']
M5A_LN = Toronto_selected.iloc[0]['Longitude']

## Client Information for Foursquare
CLIENT_ID = "331QIXI5YMKN20D3VHINKUKY5SRMBZHMS5S2WLB0YATWN21J"
CLIENT_SECRET = "KSDVCZ554PWN1244NW4FF4OF3H10FJRI5AZ0WLRBH3X4SY1X"
VERSION = '20190828'

## Set Radius & LIMIT
LIMIT = 100
radius = 500

url_M5A = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
      CLIENT_ID,
      CLIENT_SECRET,
      VERSION,
      M5A_LA,
      M5A_LN,
      radius,
      LIMIT
)
results_M5A = requests.get(url_M5A).json()

In [22]:
## Create a function that extracts the category of the different venues in the neighborhood
def get_category(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [23]:
## import necessary lib
from pandas.io.json import json_normalize

## transform results for M5A into a dataframe with near venues in M5A neighborhood
venues_M5A = results_M5A['response']['groups'][0]['items']
nearby_venues = json_normalize(venues_M5A)

## filter columns
nearby_venues_M5A =  nearby_venues.loc[:, ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]

## format the category type for each venue/row
nearby_venues_M5A['venue.categories'] = nearby_venues_M5A.apply(get_category, axis = 1)

## clear up the column names
nearby_venues_M5A.columns = [col.split(".")[-1] for col in nearby_venues_M5A.columns]

nearby_venues_M5A.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Toronto Cooper Koo Family Cherry St YMCA Centre,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


### Analysis for All Neighborhoods in Toronto
#### Obtain the dataframe for all Neighborhoods with venues
We will generalize the analysis/dataframe of M5A neighborhood to all neighborhoods in Toronto and merge all the data into one big dataset for analysis

In [24]:
## Create a function to repeat process for all neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return nearby_venues

In [25]:
## Fill in the data of different neighborhoods in Toronto
Toronto_venues = getNearbyVenues(names = Toronto_selected['Postcode'], latitudes = Toronto_selected['Latitude'], longitudes = Toronto_selected['Longitude'])
Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


### Explore the Venues in Toronto
After obtaining the venues information in Toronto as above, we will analyze their features and their possible relations with neighborhoods

In [26]:
## Apply one hot encoding to Venue Categories for easier analysis
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix = "", prefix_sep= "")

## Add neighborhood column back to dataframe
Toronto_onehot[['Postcode']] = Toronto_venues[['Neighborhood']]

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Postcode,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
## Check out the size of the dataset
Toronto_onehot.shape

(1710, 242)

In [27]:
## frequency of occurrence of each category for each neighborhood
Toronto_grouped = Toronto_onehot.groupby('Postcode').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Postcode,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,...,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.023256
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find Out the Top 10 Venues for each Neighborhood
We will present the results in a dataframe for viewing

In [28]:
## Define a function that sorts the values in rows

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [29]:
## create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(10):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

## Create a dataframe
## set up the column names for the dataframe
Toronto_venue_sorted = pd.DataFrame(columns = columns)

## set the column of "Postcode" 
Toronto_venue_sorted['Postcode'] = Toronto_grouped['Postcode']

## Set the other column values -- the top 10 venue names
for ind in np.arange(Toronto_grouped.shape[0]):
    Toronto_venue_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], 10)

Toronto_venue_sorted.head()

Unnamed: 0,Postcode,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Health Food Store,Neighborhood,Coffee Shop,Trail,Pub,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop
1,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Pub,Bookstore,Brewery,Bubble Tea Shop,Burger Joint
2,M4L,Pizza Place,Sandwich Place,Pet Store,Brewery,Food & Drink Shop,Steakhouse,Fish & Chips Shop,Light Rail Station,Board Shop,Liquor Store
3,M4M,Café,Coffee Shop,Italian Restaurant,Bakery,American Restaurant,Yoga Studio,Comfort Food Restaurant,Seafood Restaurant,Sandwich Place,Cheese Shop
4,M4N,Park,Swim School,Bus Line,Yoga Studio,Dog Run,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


### Fit & Train the Cluster Model
We will train the model with the data of Toronto_grouped values. The model used is k-means clustering, and the venues with the most similar venue distributions will be recognized as the same cluster 

In [30]:
## clear up the dataframe with only numerical values for modeling
Toronto_grouped_clustering = Toronto_grouped.drop('Postcode', axis = 1)

## run k-means clustering model
import sklearn
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 5, random_state = 0)

## train the model and fit with data
kmeans.fit(Toronto_grouped_clustering)

## check out the cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 3, 3, 3, 0, 3, 3, 3, 1, 3], dtype=int32)

### Update the Dataframe with Cluster Labels and Location Features
We will add the cluster labels to the dataframe -- Toronto_grouped

In [31]:
## Add Clustering labels 
Toronto_venue_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_venue_sorted.head()

Unnamed: 0,Cluster Labels,Postcode,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,3,M4E,Health Food Store,Neighborhood,Coffee Shop,Trail,Pub,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop
1,3,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Pub,Bookstore,Brewery,Bubble Tea Shop,Burger Joint
2,3,M4L,Pizza Place,Sandwich Place,Pet Store,Brewery,Food & Drink Shop,Steakhouse,Fish & Chips Shop,Light Rail Station,Board Shop,Liquor Store
3,3,M4M,Café,Coffee Shop,Italian Restaurant,Bakery,American Restaurant,Yoga Studio,Comfort Food Restaurant,Seafood Restaurant,Sandwich Place,Cheese Shop
4,0,M4N,Park,Swim School,Bus Line,Yoga Studio,Dog Run,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


In [32]:
## Check out the shape of the Toronto_venue_sorted
Toronto_venue_sorted.shape

(38, 12)

In [33]:
## Check out the shape of the Toronto_selected
Toronto_selected.shape

(38, 5)

In [34]:
## Since the two dataframes have the same shape, we can merge them on the Postal Code
Toronto_merged = Toronto_selected
Toronto_merged = Toronto_merged.join(Toronto_venue_sorted.set_index('Postcode'), on = 'Postcode')

Toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,3,Coffee Shop,Café,Pub,Bakery,Park,Mexican Restaurant,Gym / Fitness Center,Restaurant,Breakfast Spot,Performing Arts Venue
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,3,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Cosmetics Shop,Café,Bookstore,Bakery,Diner,Pizza Place,Bubble Tea Shop
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Restaurant,Café,Hotel,Italian Restaurant,Cosmetics Shop,Clothing Store,Bakery,Breakfast Spot,Cocktail Bar
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Health Food Store,Neighborhood,Coffee Shop,Trail,Pub,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3,Coffee Shop,Cocktail Bar,Farmers Market,Seafood Restaurant,Bakery,Steakhouse,Café,Cheese Shop,Beer Bar,Jazz Club


### Visualize the Cluster Results on the Map
We will visualize the map with different clusters of different colors

In [55]:
## import necessary lib and packages
import matplotlib.cm as cm
import matplotlib.colors as colors

## Create map
map_clusters = folium.Map(location = [lat, lng], zoom_start = 12)

## set color scheme for the clusters
x = np.arange(5)
ys = [i + x + (i*x)**2 for i in range(5)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Postcode'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Conclusion of Kmeans Clustering
As shown in the map above, most neighborhoods are categorized as cluster 3 and only five neighborhoods are categorized as other clusters. Since we do not know whether it is because kmeans clustering is approproate method for this dataset or it is the nature of the dataset, we will do another clustering mof the datasets with method -- Agglomerative Clustering

### Agglomerarive Clustering
We will use the same dataset -- Toronto_grouped_clustering for agglomerative clustering

### Fit & Train Agglomerative Model

In [48]:
## create an agglomerative clustering model
import sklearn
from sklearn.cluster import AgglomerativeClustering
agglom = AgglomerativeClustering(n_clusters = 6, linkage='complete')

## train the model and fit with data
agglom.fit(Toronto_grouped_clustering)

## check out the cluster labels generated for each row in the dataframe
agglom_labels = agglom.labels_
agglom_labels

array([0, 0, 0, 0, 5, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Create a Complete Datframe with Latitudes and Cluster Labels 

In [50]:
## add labels to Toronto_venue_sorted
Toronto_venue_sorted = Toronto_venue_sorted.drop(['Cluster Labels'], axis = 1)
Toronto_venue_sorted_agglom = Toronto_venue_sorted
Toronto_venue_sorted_agglom.insert(0, 'Cluster Labels', agglom_labels)
Toronto_venue_sorted_agglom.head()

Unnamed: 0,Cluster Labels,Postcode,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,M4E,Health Food Store,Neighborhood,Coffee Shop,Trail,Pub,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop
1,0,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Pub,Bookstore,Brewery,Bubble Tea Shop,Burger Joint
2,0,M4L,Pizza Place,Sandwich Place,Pet Store,Brewery,Food & Drink Shop,Steakhouse,Fish & Chips Shop,Light Rail Station,Board Shop,Liquor Store
3,0,M4M,Café,Coffee Shop,Italian Restaurant,Bakery,American Restaurant,Yoga Studio,Comfort Food Restaurant,Seafood Restaurant,Sandwich Place,Cheese Shop
4,5,M4N,Park,Swim School,Bus Line,Yoga Studio,Dog Run,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


In [51]:
## merge dataframe with lat & lng with dataframe with cluster labels
Toronto_merged_agglom = Toronto_selected
Toronto_merged_agglom = Toronto_merged_agglom.join(Toronto_venue_sorted_agglom.set_index('Postcode'), on = 'Postcode')

Toronto_merged_agglom.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,0,Coffee Shop,Café,Pub,Bakery,Park,Mexican Restaurant,Gym / Fitness Center,Restaurant,Breakfast Spot,Performing Arts Venue
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Cosmetics Shop,Café,Bookstore,Bakery,Diner,Pizza Place,Bubble Tea Shop
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Restaurant,Café,Hotel,Italian Restaurant,Cosmetics Shop,Clothing Store,Bakery,Breakfast Spot,Cocktail Bar
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Neighborhood,Coffee Shop,Trail,Pub,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Farmers Market,Seafood Restaurant,Bakery,Steakhouse,Café,Cheese Shop,Beer Bar,Jazz Club


### Visualize the Agglom Clustering Results 

In [56]:
## Visualize the agglomerative clustering results
## Create map
map_clusters_agglom = folium.Map(location = [lat, lng], zoom_start = 12)

## set color scheme for the clusters
x = np.arange(5)
ys = [i + x + (i*x)**2 for i in range(5)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged_agglom['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Postcode'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters_agglom)

map_clusters_agglom

### Conclusion of Agglomerative Clustering
As shown in the map above, the clustering results of Agglomerative method are the same as the results of kmeans method. This shows that it is the nature of the dataset that most neighborhoods categorized as one cluster and are thus similar