# Toronto Project Part 1: Dataframe

In [2]:
# Imports
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


In [3]:
data_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0] #bring Wiki page as dataframe
data_df.rename(columns={'Postal Code': 'PostalCode', 'Neighbourhood': 'Neighborhood'}, inplace = True) # rename columns
data_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
data_df = data_df[data_df['Borough'] != 'Not assigned'] #discard all rows that have 'Not assigned' in Borough column
data_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
#Combining rows with same Postal Code, with the neighborhoods separated with a comma.
df = data_df.groupby('PostalCode', sort=False).agg(','.join) 
df.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Replace Neighborhood = 'Not assigned' with the Borough
df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not asigned', df['Borough'], df['Neighborhood'])
df.reset_index(inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Display the dataframe shape
df.shape

(103, 3)

# Toronto Project Part 2: Latitude and Longitude

In [8]:
# Import geocoder
!pip install geocoder
import geocoder
print('Geocoder imported')

Geocoder imported


In [9]:
# Adding blank Lat and Lon

df['Longitude'] = None
df['Latitude'] = None
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


In [10]:
# Getting Lat and Lon from geocoder and adding them to df

for i, pc in enumerate(df['PostalCode']):
    lat_lng_coords = None
    
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(pc))
        lat_lng_coords = g.latlng
    
    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    
    df.loc[i, 'Latitude'] = latitude
    df.loc[i, 'Longitude'] = longitude

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,-79.3299,43.7525
1,M4A,North York,Victoria Village,-79.3131,43.7306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.3626,43.6551
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.4504,43.7233
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.3919,43.6625


# Toronto Project Part 3

### A. Exploring Neighborhoods in Toronto

In [11]:
print('The dataframe has {} boroughs and {} Postal Codes.'.format(len(df['Borough'].unique()), df.shape[0]))

The dataframe has 10 boroughs and 103 Postal Codes.


In [12]:
# Using geopy to get Lat and Lon values for Toronto
!pip install geopy
from geopy.geocoders import Nominatim

address = 'Toronto, CA'
geolocator = Nominatim(user_agent='to_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Lat and Lon of Toronto are {} / {}'.format(latitude, longitude))

Lat and Lon of Toronto are 43.6534817 / -79.3839347


In [13]:
# Import Folium

!pip install folium
import folium



In [14]:
# Creating a map of Toronto with neighborhoods superimposed on top
map_to = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add markers to map
for lat, lon, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        pop=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)
    
map_to

In [15]:
# Foursquare Credentials

CLIENT_ID = 'XONT31F4PFQKB2PPESZQ3O1OC3DIB3PVYP4IPDQDRM14GPG2'
CLIENT_SECRET = 'M11IHZ444GCB0SVITD5GFFEEYNDXT0ZJS3YTLPQZUTYUXCWA'
VERSION = '20201201'

In [16]:
# Create Near by Venues function

import requests
LIMIT = 25
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [17]:
# Creating a dataframe for venues in Toronto
to_venues = getNearbyVenues(names=df['Neighborhood'], latitudes=df['Latitude'], longitudes=df['Longitude'])

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [18]:
print(to_venues.shape)
to_venues.head()

(1228, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.75245,-79.32991,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.75245,-79.32991,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.73057,-79.31306,Wigmore Park,43.731023,-79.310771,Park
3,Victoria Village,43.73057,-79.31306,Memories of Africa,43.726602,-79.312427,Grocery Store
4,Victoria Village,43.73057,-79.31306,Guardian Drug,43.730584,-79.307432,Pharmacy


In [19]:
# Venues for each Neighborhood
to_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,16,16,16,16,16,16
"Alderwood, Long Branch",4,4,4,4,4,4
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",19,19,19,19,19,19
Berczy Park,25,25,25,25,25,25
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",25,25,25,25,25,25
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",25,25,25,25,25,25
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",25,25,25,25,25,25
Caledonia-Fairbanks,7,7,7,7,7,7


### B. Analyzing each Neighborhood in Toronto

In [20]:
# One hot encoding
to_onehot = pd.get_dummies(to_venues[['Venue Category']], prefix="", prefix_sep="")

# Drop column Neighborhood which appears amongst the categories
to_onehot.drop(columns=['Neighborhood'], axis=1, inplace=True)

# Add Neighborhood back to dataframe - I use z before Neighborhood so that the columns goes last
to_onehot['zNeighborhood'] = to_venues['Neighborhood']

# Move Neighborhood column to first column
fixed_columns = [to_onehot.columns[-1]] + list(to_onehot.columns[:-1])
to_onehot = to_onehot[fixed_columns]

# Rename column Neighborhoods
to_onehot.rename(columns={'zNeighborhood': 'Neighborhood'}, inplace=True)
to_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,Auto Workshop,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bakery,Bank,Bar,Basketball Stadium,Beer Bar,Beer Store,Bike Shop,Bike Trail,Bistro,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Brewery,Bridge,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Café,Camera Store,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Clothing Store,Cocktail Bar,Coffee Shop,College Gym,College Rec Center,College Stadium,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Creperie,Cuban Restaurant,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Event Space,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Gas Station,Gastropub,Gay Bar,General Entertainment,Gift Shop,Gluten-free Restaurant,Golf Course,Golf Driving Range,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Health Food Store,Historic Site,History Museum,Hobby Shop,Hockey Arena,Home Service,Hong Kong Restaurant,Hookah Bar,Hotel,Hotel Bar,IT Services,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Leather Goods Store,Light Rail Station,Liquor Store,Lounge,Market,Massage Studio,Mattress Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Movie Theater,Museum,Music Venue,New American Restaurant,Newsagent,Nightclub,Noodle House,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,Performing Arts Venue,Peruvian Restaurant,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Poke Place,Pool,Portuguese Restaurant,Print Shop,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Residential Building (Apartment / Condo),Restaurant,Road,Rock Climbing Spot,Roof Deck,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shanghai Restaurant,Shoe Store,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Soccer Field,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Sports Club,Steakhouse,Storage Facility,Supermarket,Sushi Restaurant,Swim School,Tailor Shop,Taiwanese Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
to_onehot.shape

(1228, 216)

In [22]:
# Group by Neighborhood and take the mean of frequency of each Category

to_grouped = to_onehot.groupby('Neighborhood').mean().reset_index()
print(to_grouped.shape)

(96, 216)


In [23]:
# Finding top 3 most common venues per Neighborhood
num = 3

for hood in to_grouped['Neighborhood']:
    print('-----'+hood+'-----')
    temp = to_grouped[to_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num))
    print('\n')

-----Agincourt-----
                  venue  freq
0    Chinese Restaurant  0.12
1  Hong Kong Restaurant  0.06
2       Badminton Court  0.06


-----Alderwood, Long Branch-----
               venue  freq
0  Convenience Store  0.25
1                Pub  0.25
2                Gym  0.25


-----Bayview Village-----
                        venue  freq
0  Construction & Landscaping  0.25
1                        Park  0.25
2          Golf Driving Range  0.25


-----Bedford Park, Lawrence Manor East-----
                venue  freq
0  Italian Restaurant  0.11
1         Coffee Shop  0.11
2      Sandwich Place  0.11


-----Berczy Park-----
                venue  freq
0            Beer Bar  0.08
1  Seafood Restaurant  0.08
2      Farmers Market  0.08


-----Birch Cliff, Cliffside West-----
                   venue  freq
0  General Entertainment  0.25
1           Skating Rink  0.25
2        College Stadium  0.25


-----Brockton, Parkdale Village, Exhibition Place-----
                    venue  fre

                       venue  freq
0                        Gym  0.12
1  Middle Eastern Restaurant  0.12
2                Gas Station  0.12


-----Northwood Park, York University-----
                    venue  freq
0  Furniture / Home Store  0.14
1                     Bar  0.07
2              Restaurant  0.07


-----Old Mill South, King's Mill Park, Sunnylea, Humber Bay, Mimico NE, The Queensway East, Royal York South East, Kingsway Park South East-----
                  venue  freq
0                  Bank  0.12
1                  Park  0.12
2  Fast Food Restaurant  0.12


-----Parkdale, Roncesvalles-----
                         venue  freq
0          American Restaurant  0.08
1  Eastern European Restaurant  0.08
2                  Coffee Shop  0.08


-----Parkview Hill, Woodbine Gardens-----
          venue  freq
0   Pizza Place  0.13
1      Bus Line  0.07
2  Intersection  0.07


-----Parkwoods-----
               venue  freq
0  Food & Drink Shop   0.5
1               Park   0.5
2  

In [24]:
# Getting that informationg into a dataframe - defining function
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [49]:
# Getting that informationg into a dataframe - displaying top 3 venues for each Neighborhood

num_top_venues = 3
indicators = ['st', 'nd', 'rd']

# Creating columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind])) # is gonna work because num_top_venues = 3
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
# Creating new dataframe
nei_venues = pd.DataFrame(columns=columns)
nei_venues['Neighborhood']=to_grouped['Neighborhood']

for ind in np.arange(to_grouped.shape[0]):
    nei_venues.iloc[ind, 1:]=return_most_common_venues(to_grouped.iloc[ind, :], num_top_venues)

print(nei_venues.shape)
nei_venues.head()

(96, 4)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Agincourt,Chinese Restaurant,Grocery Store,Department Store
1,"Alderwood, Long Branch",Convenience Store,Gym,Pub
2,Bayview Village,Construction & Landscaping,Park,Golf Driving Range
3,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Coffee Shop
4,Berczy Park,Seafood Restaurant,Farmers Market,Beer Bar


### C. KMeans Clustering - 5 clusters

In [28]:
# Importing KMeans
from sklearn.cluster import KMeans 

In [42]:
# Number of clusters
kclusters = 5

to_grouped_clustering = to_grouped.drop('Neighborhood', 1)

# Run K-Means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(to_grouped_clustering)

kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 3, 3, 4, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 3, 3])

In [50]:
# Add clustering label to dataframe

nei_venues.insert(0, 'Cluster Labels', kmeans.labels_)
nei_venues.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,0,Agincourt,Chinese Restaurant,Grocery Store,Department Store
1,0,"Alderwood, Long Branch",Convenience Store,Gym,Pub
2,0,Bayview Village,Construction & Landscaping,Park,Golf Driving Range
3,0,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Coffee Shop
4,0,Berczy Park,Seafood Restaurant,Farmers Market,Beer Bar


In [51]:
# Adding PostalCode, Borough, Latitude and Longitude 

to_merged = df
to_merged = to_merged.join(nei_venues.set_index('Neighborhood'), on='Neighborhood')

# Dropping 3 rows with NaN as Cluster Labels
to_merged.dropna(subset=['Cluster Labels'], axis=0, inplace=True)

# Converting Cluster Labels to integer, needed for visualization
to_merged['Cluster Labels'] = to_merged['Cluster Labels'].astype(int)

print(to_merged.shape)
to_merged

(100, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M3A,North York,Parkwoods,-79.3299,43.7525,3,Food & Drink Shop,Park,Zoo Exhibit
1,M4A,North York,Victoria Village,-79.3131,43.7306,3,Pharmacy,Grocery Store,Park
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.3626,43.6551,0,Coffee Shop,Breakfast Spot,Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.4504,43.7233,0,Clothing Store,Cosmetics Shop,Furniture / Home Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.3919,43.6625,0,Coffee Shop,Sandwich Place,Bank
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",-79.5283,43.6626,0,Pharmacy,Bank,Grocery Store
6,M1B,Scarborough,"Malvern, Rouge",-79.1966,43.8114,0,Zoo Exhibit,Fast Food Restaurant,Furniture / Home Store
7,M3B,North York,Don Mills,-79.3619,43.7492,0,Coffee Shop,Intersection,Gym
8,M4B,East York,"Parkview Hill, Woodbine Gardens",-79.3119,43.7072,0,Pizza Place,Gym / Fitness Center,Gastropub
9,M5B,Downtown Toronto,"Garden District, Ryerson",-79.378,43.6574,0,Café,Theater,Ramen Restaurant


In [54]:
# Clusters visualization

import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(to_merged['Latitude'], to_merged['Longitude'], to_merged['Neighborhood'], to_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### D. Examining the 5 clusters

In [58]:
# Cluster 0
to_merged.loc[to_merged['Cluster Labels'] ==0, to_merged.columns[[2] + list (range(5, to_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
2,"Regent Park, Harbourfront",0,Coffee Shop,Breakfast Spot,Restaurant
3,"Lawrence Manor, Lawrence Heights",0,Clothing Store,Cosmetics Shop,Furniture / Home Store
4,"Queen's Park, Ontario Provincial Government",0,Coffee Shop,Sandwich Place,Bank
5,"Islington Avenue, Humber Valley Village",0,Pharmacy,Bank,Grocery Store
6,"Malvern, Rouge",0,Zoo Exhibit,Fast Food Restaurant,Furniture / Home Store
7,Don Mills,0,Coffee Shop,Intersection,Gym
8,"Parkview Hill, Woodbine Gardens",0,Pizza Place,Gym / Fitness Center,Gastropub
9,"Garden District, Ryerson",0,Café,Theater,Ramen Restaurant
10,Glencairn,0,Pizza Place,Grocery Store,Pub
11,"West Deane Park, Princess Gardens, Martin Grov...",0,Pizza Place,Tea Room,Print Shop


In [59]:
# Cluster 1
to_merged.loc[to_merged['Cluster Labels'] ==1, to_merged.columns[[2] + list (range(5, to_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
98,"The Kingsway, Montgomery Road, Old Mill North",1,Pool,Zoo Exhibit,Distribution Center


In [60]:
# Cluster 2
to_merged.loc[to_merged['Cluster Labels'] ==2, to_merged.columns[[2] + list (range(5, to_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
71,"Wexford, Maryvale",2,Auto Garage,Zoo Exhibit,Distribution Center


In [61]:
# Cluster 3
to_merged.loc[to_merged['Cluster Labels'] ==3, to_merged.columns[[2] + list (range(5, to_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Parkwoods,3,Food & Drink Shop,Park,Zoo Exhibit
1,Victoria Village,3,Pharmacy,Grocery Store,Park
18,"Guildwood, Morningside, West Hill",3,Construction & Landscaping,Gym / Fitness Center,Park
27,Hillcrest Village,3,Park,Residential Building (Apartment / Condo),Zoo Exhibit
35,"East Toronto, Broadview North (Old East York)",3,Intersection,Convenience Store,Park
45,"York Mills, Silver Hills",3,Park,Zoo Exhibit,Distribution Center
49,"North Park, Maple Leaf Park, Upwood Park",3,Park,Bakery,Zoo Exhibit
66,York Mills West,3,Speakeasy,Convenience Store,Park
68,"Forest Hill North & West, Forest Hill Road Park",3,Park,French Restaurant,Zoo Exhibit
69,"High Park, The Junction South",3,Convenience Store,Park,Residential Building (Apartment / Condo)


In [62]:
# Cluster 4
to_merged.loc[to_merged['Cluster Labels'] ==4, to_merged.columns[[2] + list (range(5, to_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
12,"Rouge Hill, Port Union, Highland Creek",4,Construction & Landscaping,Bar,Dog Run
50,Humber Summit,4,Construction & Landscaping,Furniture / Home Store,Dog Run
