# Week 3 Project

In [127]:
import os
import time

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from sklearn.cluster import KMeans

## Importing the raw table from Wikipedia

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
# read_html returns every table found on the page; the first one is used here.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(wiki_url)[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Cleaning up the Table According to Project Specifications

### Removing Rows with 'Not assigned' Boroughs

In [3]:
# Keep only the rows whose borough is something other than the
# 'Not assigned' placeholder.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combining Neighborhoods with the same postal code

In [4]:
# Show every row whose postal code repeats an earlier row's postal code.
# No duplicates found: neighborhoods appear to already be combined within
# the wikipedia table.
repeated_postal = df.duplicated(subset='Postal Code')
df.loc[repeated_postal]

Unnamed: 0,Postal Code,Borough,Neighborhood


### Setting Neighborhood to Borough if Neighborhood is 'Not assigned'

In [5]:
# The raw table spells the placeholder 'Not assigned' (lowercase "a") and shows
# a blank neighborhood for unassigned rows, so an exact match on 'Not Assigned'
# could never find anything. Match the placeholder case-insensitively and also
# treat missing values as unassigned.
neighborhood_text = df['Neighborhood'].astype(str).str.strip().str.lower()
unassigned_neighborhood = df['Neighborhood'].isna() | neighborhood_text.eq('not assigned')

# An empty result means no neighborhood has to be replaced by its borough.
df[unassigned_neighborhood]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [6]:
# (rows, columns) of the table after the cleaning steps above.
df.shape

(103, 3)

## Geocoding

In [7]:
import geocoder

In [11]:
# Maximum number of lookups per postal code. The lookup is retried because a
# None `geo.json` is treated as "no result yet", but an unbounded retry loop
# would hang the notebook forever on a postal code that never resolves.
MAX_GEOCODE_ATTEMPTS = 5

latitudes = []
longitudes = []
for postal in df['Postal Code']:
    query = '{}, Toronto, Ontario'.format(postal)
    cords = None

    for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
        geo = geocoder.arcgis(query)
        cords = geo.json
        if cords is not None:
            break
        if attempt < MAX_GEOCODE_ATTEMPTS:
            # Back off (0.5s, 1s, 2s, ...) instead of hammering the service.
            time.sleep(0.5 * 2 ** (attempt - 1))
    else:
        # Every attempt came back empty: fail loudly rather than spin forever.
        raise RuntimeError(
            'Could not geocode {!r} after {} attempts'.format(query, MAX_GEOCODE_ATTEMPTS))

    # One coordinate pair per postal code, in the same order as df's rows.
    latitudes.append(cords['lat'])
    longitudes.append(cords['lng'])

In [16]:
# Build a new frame rather than assigning columns in place: `df` is the result
# of a boolean filter, and in-place column assignment on such a frame can raise
# pandas' SettingWithCopyWarning. The lists are matched to rows by position, so
# they must contain exactly one entry per row of `df`.
df = df.assign(Latitude=latitudes, Longitude=longitudes)

In [18]:
# Preview the table now that the Latitude/Longitude columns are attached.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.752935,-79.335641
3,M4A,North York,Victoria Village,43.728102,-79.31189
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939


## Clustering of Neighborhoods

### Getting Data from Foursquare

#### Defining API request variables

In [21]:
# Foursquare credentials are read from the environment so that secrets are not
# committed with the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded here has been exposed in
# the notebook's history and should be revoked/rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180604'  # sent as the `v` parameter of every request

if not (CLIENT_ID and CLIENT_SECRET):
    # Surface the problem here instead of as an opaque API failure later on.
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')

#### Function to get nearby venues

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in lockstep.
    radius : int, default 500
        Sent as the `radius` query parameter.
    limit : int, default 100
        Sent as the `limit` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # A timeout keeps one stalled connection from hanging the notebook;
        # raise_for_status turns HTTP errors into a clear exception instead of
        # a KeyError on the missing payload.
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            # A venue without any category gets None instead of raising IndexError.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing `columns` up front keeps the schema intact even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [29]:
# Fetch the venues around every neighborhood, using the function's default
# radius and limit.
venues = getNearbyVenues(
    names=df['Neighborhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

In [33]:
# Preview the first venues returned by getNearbyVenues.
venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.752935,-79.335641,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.752935,-79.335641,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.728102,-79.31189,Tim Hortons,43.725517,-79.313103,Coffee Shop
3,Victoria Village,43.728102,-79.31189,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.728102,-79.31189,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection


In [97]:
# (number of venue rows, number of columns) in the venues table.
venues.shape

(2287, 7)

### Preparation of Data for Clustering

In [98]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
onehot['Neighborhood'] = venues['Neighborhood']

# move neighborhood column to start
cols = onehot.columns
cols = cols[cols != 'Neighborhood'].insert(0, 'Neighborhood')
onehot = onehot[cols]

onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Grouping by the neighborhood

In [99]:
# Average every indicator column per neighborhood, keeping 'Neighborhood' as a
# regular column rather than as the index.
neighborhood_grouping = onehot.groupby('Neighborhood', as_index=False).mean()
neighborhood_grouping.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
neighborhood_grouping.shape

(98, 267)

### Calculating Clusters

In [101]:
k_clusters = 6

# Feature matrix for clustering: everything except the label columns.
# - `columns=` replaces the positional axis argument, which pandas 2.x rejects.
# - 'Cluster' is dropped too (ignored when absent) so that re-running this cell
#   after the labels were attached does not feed the old labels back in.
data_for_clustering = neighborhood_grouping.drop(
    columns=['Neighborhood', 'Cluster'], errors='ignore')

# n_init is pinned because its default differs between scikit-learn versions;
# together with random_state this keeps the clustering reproducible.
clusters = KMeans(n_clusters=k_clusters, random_state=11, n_init=10).fit(data_for_clustering)

In [102]:
# Cluster label assigned to each row of data_for_clustering, in row order.
clusters.labels_

array([1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 0, 0, 0, 5, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 2, 1, 1, 1, 1, 1, 0, 1], dtype=int32)

In [103]:
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Drop any previous 'Cluster' column first so
# the cell can be re-run safely.
if 'Cluster' in neighborhood_grouping.columns:
    neighborhood_grouping = neighborhood_grouping.drop(columns='Cluster')

# Place the labels right after the 'Neighborhood' column.
neighborhood_grouping.insert(1, 'Cluster', clusters.labels_)

In [107]:
# Keep only each neighborhood's name and its cluster label.
results = neighborhood_grouping.loc[:, ['Neighborhood', 'Cluster']]
results.head()

Unnamed: 0,Neighborhood,Cluster
0,Agincourt,1
1,"Alderwood, Long Branch",1
2,"Bathurst Manor, Wilson Heights, Downsview North",1
3,Bayview Village,3
4,"Bedford Park, Lawrence Manor East",1


In [113]:
# Attach postal code, borough and coordinates to each clustered neighborhood.
# The left side is rebuilt from neighborhood_grouping instead of merging
# `results` into itself: re-running a self-referential merge would duplicate
# the df columns with _x/_y suffixes and break the later 'Latitude' and
# 'Longitude' lookups.
results = neighborhood_grouping[['Neighborhood', 'Cluster']].merge(
    df, on='Neighborhood', how='left')

In [122]:
# Preview the clustered neighborhoods together with their location columns.
results.head()

Unnamed: 0,Neighborhood,Cluster,Postal Code,Borough,Latitude,Longitude
0,Agincourt,1,M1S,Scarborough,43.79393,-79.265694
1,"Alderwood, Long Branch",1,M8W,Etobicoke,43.600895,-79.540387
2,"Bathurst Manor, Wilson Heights, Downsview North",1,M3H,North York,43.757394,-79.442394
3,Bayview Village,3,M2K,North York,43.780607,-79.376921
4,"Bedford Park, Lawrence Manor East",1,M5M,North York,43.735447,-79.417944


### The Map

In [124]:
geo = geocoder.arcgis('Toronto, Ontario')

# Read the result once and fail with a clear message if the lookup came back
# empty, instead of a TypeError when indexing None.
center = geo.json
if center is None:
    raise RuntimeError("Could not geocode 'Toronto, Ontario' for the map center")

latitude = center['lat']
longitude = center['lng']

In [129]:
# initialize the map, centered on the coordinates looked up above
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# one hex color per cluster, evenly spaced along matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per row of `results`
for lat, lon, poi, cluster in zip(
    results['Latitude'],
    results['Longitude'],
    results['Neighborhood'],
    results['Cluster']):

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)

    # Cluster labels start at 0 (see clusters.labels_), so index the palette
    # with the label itself; `cluster-1` made cluster 0 wrap around to the
    # last color.
    cluster_color = rainbow[int(cluster)]

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters