## Importing required libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [2]:
import requests

In [138]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## Setting maxcolwidth to 800 for readability

In [3]:
pd.set_option('max_colwidth', 800)

## instantiating BeautifulSoup object and reading the table from wikipedia page

In [4]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text 
soup = BeautifulSoup(source, 'lxml')

## Creating a csv_writer to append the scraped content in an xlx file by initally defining column names 

In [5]:
csv_file = open('toronto_postal_codes.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])

32

## Main Scraping of data begins here

In [6]:
table = soup.find('table', class_ = 'wikitable') # Gets the table from the webpage
rows = table.find_all('tr') # Gets the table rows

postcodes = [] # Initializes the raw postcodes list
boroughs = [] # Initializes the raw boroughs list
neighbourhoods = [] # Initializes the raw neighbourhoods list

for row in rows:    
    columns = row.find_all('td')
    try :
        if columns[1].text != 'Not assigned':  # To skip if the borough name is 'Not Assigned'
            
            postcode = columns[0].text
            postcodes.append(postcode)
            
            borough = columns[1].text
            boroughs.append(borough)
            
            neighbourhood = columns[2].text.split('\n')[0] # Removing the newline character at the end     
            
            if neighbourhood == 'Not assigned': # Assigning the same name to neighbourhood if it is 'Not Assigned'
                neighbourhood = borough            
                
            neighbourhoods.append(neighbourhood)
             
    except Exception as e : # To skip the first row which contains column names
        pass 
    
postcode_explored = [] # Initializing the list of explored postcodes
for index_i, postcode_i in enumerate(postcodes) :   
    if postcode_i not in postcode_explored :
        nbds = neighbourhoods[index_i]
        for index_f, postcode_f in enumerate(postcodes) :
            if postcode_i == postcode_f and index_i != index_f:
                nbds = nbds + ', ' + neighbourhoods[index_f] # Concatenating the neighbourhood names
        csv_writer.writerow([postcode_i, boroughs[index_i], nbds]) # Writing the rows in the csv file
        postcode_explored.append(postcode_i)


## Closing the CSV file

In [7]:
csv_file.close()

## Creating a pandas dataframe

In [8]:
toronto_df=pd.read_csv('toronto_postal_codes.csv')

## Gauging the shape of the created pandas dataframe

In [9]:
toronto_df.shape

(180, 3)

## Final Reformatting and resultant dataframe

In [10]:
toronto_df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A\r\n,Not assigned\r\n,
1,M2A\r\n,Not assigned\r\n,
2,M3A\r\n,North York\r\n,Parkwoods
3,M4A\r\n,North York\r\n,Victoria Village
4,M5A\r\n,Downtown Toronto\r\n,Regent Park / Harbourfront


In [11]:
toronto_df=toronto_df.replace('\r\n', '',regex=True)

In [12]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [13]:
toronto_df_copy=toronto_df #making a copy for redundancy 

In [14]:
drop_index=toronto_df.loc[toronto_df['Borough']=='Not assigned'].index

In [15]:
drop_index

Int64Index([  0,   1,   7,  10,  15,  16,  19,  24,  25,  28,  29,  33,  34,
             35,  37,  38,  42,  43,  44,  51,  52,  53,  60,  61,  62,  69,
             70,  71,  78,  79,  87,  88,  96,  97, 101, 105, 106, 110, 115,
            118, 119, 123, 124, 125, 127, 128, 131, 132, 133, 134, 136, 137,
            140, 141, 145, 146, 149, 150, 154, 155, 158, 159, 161, 162, 163,
            164, 166, 167, 170, 171, 172, 173, 174, 175, 176, 177, 179],
           dtype='int64')

In [16]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [17]:
toronto_df_copy.drop(drop_index,inplace=True)

In [18]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [19]:
toronto_df_copy.reset_index(inplace=True)

In [20]:
toronto_df_copy.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Regent Park / Harbourfront
3,5,M6A,North York,Lawrence Manor / Lawrence Heights
4,6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [21]:
toronto_df_copy=toronto_df_copy.drop('index',1)

In [22]:
toronto_df_copy['Neighbourhood'].replace('/',',',regex=True,inplace=True)

In [28]:
t=toronto_df_copy[['Neighbourhood']]
t.replace('/',',',regex=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


In [23]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [24]:
toronto_df_copy.shape

(103, 3)

## Testing Geopy library

In [25]:
from geopy.geocoders import Nominatim
nom=Nominatim()

  


In [26]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [254]:
toronto_df_copy.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea , Humber Bay , Mimico NE , The Queensway East , Royal York South East , Kingsway Park South East"
102,M8Z,Etobicoke,"Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West"


In [260]:
x=nom.geocode("Downtown Toronto Queen's Park, Toronto")

In [27]:
toronto_df_copy['Address']=toronto_df_copy['Borough']+" "+toronto_df_copy['Neighbourhood']

In [28]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,North York Parkwoods
1,M4A,North York,Victoria Village,North York Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront","Downtown Toronto Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights","North York Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government","Downtown Toronto Queen's Park , Ontario Provincial Government"


In [29]:
toronto_df_copy['Coordinates']=toronto_df_copy['Address'].apply(nom.geocode)

GeocoderTimedOut: Service timed out

In [30]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,North York Parkwoods
1,M4A,North York,Victoria Village,North York Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront","Downtown Toronto Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights","North York Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government","Downtown Toronto Queen's Park , Ontario Provincial Government"


In [269]:
toronto_df_copy.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Coordinates
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North","Etobicoke The Kingsway , Montgomery Road , Old Mill North",
99,M4Y,Downtown Toronto,Church and Wellesley,Downtown Toronto Church and Wellesley,"(Holiday Inn Toronto Downtown Centre, 30, Carlton Street, Church-Wellesley Village, Toronto Centre, Old Toronto, Toronto, Golden Horseshoe, Ontario, M5B 1L2, Canada, (43.6617403, -79.3810866))"
100,M7Y,East Toronto,Business reply mail Processing CentrE,East Toronto Business reply mail Processing CentrE,
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea , Humber Bay , Mimico NE , The Queensway East , Royal York South East , Kingsway Park South East","Etobicoke Old Mill South , King's Mill Park , Sunnylea , Humber Bay , Mimico NE , The Queensway East , Royal York South East , Kingsway Park South East",
102,M8Z,Etobicoke,"Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West","Etobicoke Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West",


In [272]:
toronto_df_copy

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Coordinates
0,M3A,North York,Parkwoods,North York Parkwoods,"(Parkwoods Village Drive, Parkway East, Don Valley East, North York, Toronto, Golden Horseshoe, Ontario, M3A 2X2, Canada, (43.7587999, -79.3201966))"
1,M4A,North York,Victoria Village,North York Victoria Village,"(Victoria Village, Don Valley East, North York, Toronto, Golden Horseshoe, Ontario, M4A 2B1, Canada, (43.732658, -79.3111892))"
2,M5A,Downtown Toronto,"Regent Park , Harbourfront","Downtown Toronto Regent Park , Harbourfront",
3,M6A,North York,"Lawrence Manor , Lawrence Heights","North York Lawrence Manor , Lawrence Heights","(Lawrence Heights, Eglinton—Lawrence, North York, Toronto, Golden Horseshoe, Ontario, M6A 2R1, Canada, (43.7227784, -79.4509332))"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government","Downtown Toronto Queen's Park , Ontario Provincial Government",
5,M9A,Etobicoke,Islington Avenue,Etobicoke Islington Avenue,"(Islington Avenue, The Queensway, Etobicoke—Lakeshore, Etobicoke, Toronto, Golden Horseshoe, Ontario, M8Z 6C7, Canada, (43.6225748, -79.5142154))"
6,M1B,Scarborough,"Malvern , Rouge","Scarborough Malvern , Rouge","(Baton Rouge, 520, Progress Avenue, Scarborough, Scarborough Centre, Scarborough, Toronto, Golden Horseshoe, Ontario, M1P 5J1, Canada, (43.7792995, -79.25712648470281))"
7,M3B,North York,Don Mills,North York Don Mills,"(Don Mills, Sheppard Avenue East, Parkway Forest, North York, Toronto, Golden Horseshoe, Ontario, M2J 5A7, Canada, (43.775347, -79.3459439))"
8,M4B,East York,"Parkview Hill , Woodbine Gardens","East York Parkview Hill , Woodbine Gardens",
9,M5B,Downtown Toronto,"Garden District, Ryerson","Downtown Toronto Garden District, Ryerson","(Ryerson Theatre, 31,43, Gerrard Street East, Downtown Yonge, Toronto Centre, Old Toronto, Toronto, Golden Horseshoe, Ontario, M5B 1G7, Canada, (43.6593908, -79.3795591))"


In [283]:
toronto_df_copy['Latitude']=toronto_df_copy['Coordinates'].apply(lambda x: x.latitude if x!=None else None)
toronto_df_copy['Longitude']=toronto_df_copy['Coordinates'].apply(lambda y: y.longitude if y!=None else None)

In [42]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,North York Parkwoods
1,M4A,North York,Victoria Village,North York Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront","Downtown Toronto Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights","North York Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government","Downtown Toronto Queen's Park , Ontario Provincial Government"


In [31]:
toronto_df_copy.drop(['Address'],1)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [32]:
loc_df=pd.read_csv('Geospatial_Coordinates.csv')

In [33]:
loc_df.shape

(103, 3)

In [34]:
loc_df.rename(columns={'Postal Code':'Postcode'},inplace=True)

In [35]:
test=pd.merge(toronto_df_copy,loc_df, on='Postcode', how='inner')

In [41]:
test.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Latitude,Longitude
0,M3A,North York,Parkwoods,North York Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,North York Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront","Downtown Toronto Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights","North York Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government","Downtown Toronto Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [42]:
toronto_df_copy=test

In [43]:
toronto_df_copy.drop('Address',1,inplace=True)

In [343]:
toronto_df_copy.rename(columns={'Latitude_y':'Latitude','Longitude_y':'Longitude'},inplace=True)

In [44]:
toronto_df_copy.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


## Importing the necessary libraries for neighborhood analysis 

In [37]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium # Map plotting library
import numpy as np
from pandas.io.json import json_normalize # Tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import k-means from clustering stage
from sklearn.cluster import KMeans

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [46]:
#Toronto Coordinate to creat a folium map
tor_lat = 43.6532
tor_lng = -79.3832
toronto_map=folium.Map(location=[tor_lat,tor_lng],zoom_start=10)

#adding markers to the map
for lat,lng,borough,neigh in zip(toronto_df_copy['Latitude'],toronto_df_copy['Longitude'],toronto_df_copy['Borough'],toronto_df_copy['Neighbourhood']):
    label='{},{}'.format(neigh,borough) #labels would prompt borough and neighbourhood name
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat, lng],   #circle markers for every plotted latitude and longitude
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map).add_to(toronto_map)
    
toronto_map
    

## instantiating Foursquare API

In [47]:
CLIENT_ID='JAILAMIF4H0FPZZ22ZKU4LDPQJBRNE4N1C3YH4NYH5OZNBL1' # Foursquare ID
CLIENT_SECRET='BP4XAUNU52R3G1LEWLKWOPLSTHCLMTOWCR2YW05LZIPQGOPM' # Foursquare Secret
VERSION='20180605'

## We reduce the Foursquare API calls by reducing the results only to inlcude boroughs named toronto

In [57]:
toronto_borough = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
toronto_df_copy = toronto_df_copy[toronto_df_copy['Borough'].isin(toronto_borough)].reset_index(drop=True)
print(toronto_df_copy.shape)
toronto_df_copy.head()

(39, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


##  iteratively extracting venue category and making a list

In [60]:

radius = 500
LIMIT = 100

venues_list = []

for lat,lng, post, borough, neighborhood in zip(toronto_df_copy['Latitude'],toronto_df_copy['Longitude'],toronto_df_copy['Postcode'],toronto_df_copy['Borough'],toronto_df_copy['Neighbourhood']):
    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items'] # the necessary data for consideration, venue details lies within the items sublist, within groups sublist which is inturn within the "response" master list
    
    for v in results:
        venues_list.append((
                post,
                borough,
                neighborhood,
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']))


### we now create a seperate dataframe for the viewing of the venues nearby

In [61]:
toronto_df_venues=pd.DataFrame(venues_list)

In [64]:
toronto_df_venues.columns = ['Postcode', 
                     'Borough', 
                     'Neighbourhood',
                     'Borough Latitude', 
                     'Borough Longitude', 
                     'Venue Name',
                     'Venue Latitude',
                     'Venue Logitude',
                     'Venue Category']
print(toronto_df_venues.shape)
toronto_df_venues.head()

(1622, 9)


Unnamed: 0,Postcode,Borough,Neighbourhood,Borough Latitude,Borough Longitude,Venue Name,Venue Latitude,Venue Logitude,Venue Category
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


### getting a count for the number of entries within each borough name 

In [65]:
toronto_df_venues.groupby('Borough').count()

Unnamed: 0_level_0,Postcode,Neighbourhood,Borough Latitude,Borough Longitude,Venue Name,Venue Latitude,Venue Logitude,Venue Category
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Central Toronto,112,112,112,112,112,112,112,112
Downtown Toronto,1228,1228,1228,1228,1228,1228,1228,1228
East Toronto,123,123,123,123,123,123,123,123
West Toronto,159,159,159,159,159,159,159,159


### lets figure out how many unique categories are there

In [71]:
print("there are {} unique venue categories in the dataframe".format(len(toronto_df_venues['Venue Category'].unique())))

there are 227 unique venue categories in the dataframe


### We one hot encode different categories to better analyse boroughs

In [72]:
toronto_df_onehot=pd.get_dummies(toronto_df_venues[['Venue Category']],prefix="",prefix_sep="")

In [73]:
toronto_df_onehot.head()

Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### as we can see there is no "Borough column" to provide insight about so we add it back from the toronto_df_venue df

In [123]:
toronto_df_onehot.drop('Borough',1,inplace=True)

In [124]:
toronto_df_onehot['Borough']=toronto_df_venues['Borough']
toronto_df_onehot.head()
#move Borough to first column
fixed_columns = [toronto_df_onehot.columns[-1]] + list(toronto_df_onehot.columns[:-1])
toronto_df_onehot = toronto_df_onehot[fixed_columns]
toronto_df_onehot.head()

Unnamed: 0,Borough,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,...,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Strip Club,Supermarket,Sushi Restaurant,Swim School
0,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We now group boroughs by venue categories factoring in the mean of frequency of the same 

In [125]:
toronto_df_group=toronto_df_onehot.groupby('Borough').mean().reset_index()

In [126]:
toronto_df_group.head()

Unnamed: 0,Borough,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,...,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Strip Club,Supermarket,Sushi Restaurant,Swim School
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.008929,0.008929,0.0,...,0.0,0.008929,0.008929,0.0,0.0,0.0,0.0,0.008929,0.035714,0.008929
1,Downtown Toronto,0.000814,0.004072,0.000814,0.000814,0.0,0.008958,0.0,0.011401,0.009772,...,0.002443,0.004886,0.001629,0.0,0.0,0.010586,0.000814,0.000814,0.009772,0.0
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00813,0.0,...,0.0,0.0,0.0,0.0,0.00813,0.00813,0.0,0.0,0.00813,0.0
3,West Toronto,0.0,0.0,0.0,0.0,0.006289,0.006289,0.0,0.012579,0.006289,...,0.006289,0.0,0.0,0.006289,0.0,0.0,0.0,0.006289,0.012579,0.0


### we now rank the top 5 categories of venue for each borough

In [127]:
num_top_venues = 5

for hood in toronto_df_group['Borough']:
    print("----"+hood+"----")
    temp = toronto_df_group[toronto_df_group['Borough'] == hood].T.reset_index()
    temp.columns = ['Venue', 'Frequency']
    temp = temp.iloc[1:]
    temp['Frequency'] = temp['Frequency'].astype(float)
    temp = temp.round({'Frequency':2})
    print(temp.sort_values('Frequency', ascending = False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            Venue  Frequency
0     Coffee Shop       0.07
1  Sandwich Place       0.06
2            Café       0.05
3            Park       0.05
4    Dessert Shop       0.04


----Downtown Toronto----
                Venue  Frequency
0         Coffee Shop       0.10
1                Café       0.06
2          Restaurant       0.04
3  Italian Restaurant       0.03
4               Hotel       0.03


----East Toronto----
                Venue  Frequency
0    Greek Restaurant       0.07
1         Coffee Shop       0.05
2                Café       0.04
3                Park       0.04
4  Italian Restaurant       0.04


----West Toronto----
                Venue  Frequency
0                Café       0.07
1                 Bar       0.07
2         Coffee Shop       0.05
3  Italian Restaurant       0.04
4          Restaurant       0.04




### a function to return the most common venues 

In [128]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [129]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
Borough_venues_sorted = pd.DataFrame(columns=columns)
Borough_venues_sorted['Borough'] = toronto_df_group['Borough']

for ind in np.arange(toronto_df_group.shape[0]):
    Borough_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_df_group.iloc[ind, :], num_top_venues)

Borough_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
2,East Toronto,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant
3,West Toronto,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot


## IMPLEMENTING K-Means Clustering

In [130]:
toronto_df_group.head()

Unnamed: 0,Borough,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,...,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Strip Club,Supermarket,Sushi Restaurant,Swim School
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.008929,0.008929,0.0,...,0.0,0.008929,0.008929,0.0,0.0,0.0,0.0,0.008929,0.035714,0.008929
1,Downtown Toronto,0.000814,0.004072,0.000814,0.000814,0.0,0.008958,0.0,0.011401,0.009772,...,0.002443,0.004886,0.001629,0.0,0.0,0.010586,0.000814,0.000814,0.009772,0.0
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00813,0.0,...,0.0,0.0,0.0,0.0,0.00813,0.00813,0.0,0.0,0.00813,0.0
3,West Toronto,0.0,0.0,0.0,0.0,0.006289,0.006289,0.0,0.012579,0.006289,...,0.006289,0.0,0.0,0.006289,0.0,0.0,0.0,0.006289,0.012579,0.0


In [131]:
toronto_df_test=toronto_df_group.drop('Borough',1)
k_clusters=4

In [133]:
kmeans=KMeans(n_clusters=k_clusters,random_state=0).fit(toronto_df_test)
kmeans.labels_

array([0, 3, 2, 1])

## restructuring the dataframe to also include the cluster number

In [134]:
Borough_venues_sorted.insert(0, 'Cluster Lables', kmeans.labels_)
toronto_final = toronto_df_copy
toronto_final = toronto_final.join(Borough_venues_sorted.set_index('Borough'), on='Borough')

toronto_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Lables,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant


## Visualizing the clusters formed

In [140]:
# create map
map_clusters = folium.Map(location=[tor_lat, tor_lng], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k_clusters)
ys = [i + x + (i*x)**2 for i in range(k_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_final['Latitude'], toronto_final['Longitude'], toronto_final['Borough'], toronto_final['Cluster Lables']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [143]:
toronto_final.loc[toronto_final['Cluster Lables'] == 0, toronto_final.columns[[1] + list(range(4, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Lables,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,-79.38879,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
19,Central Toronto,-79.416936,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
20,Central Toronto,-79.390197,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
21,Central Toronto,-79.411307,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
23,Central Toronto,-79.405678,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
24,Central Toronto,-79.405678,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
26,Central Toronto,-79.38879,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
29,Central Toronto,-79.38316,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
31,Central Toronto,-79.400049,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym


In [145]:
toronto_final.loc[toronto_final['Cluster Lables'] == 1, toronto_final.columns[[1] + list(range(4, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Lables,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,West Toronto,-79.442259,1,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot
11,West Toronto,-79.41975,1,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot
14,West Toronto,-79.428191,1,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot
22,West Toronto,-79.464763,1,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot
25,West Toronto,-79.456325,1,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot
28,West Toronto,-79.48445,1,Bar,Café,Coffee Shop,Restaurant,Italian Restaurant,Grocery Store,Park,Bakery,Gift Shop,Breakfast Spot


In [146]:
toronto_final.loc[toronto_final['Cluster Lables'] == 2, toronto_final.columns[[1] + list(range(4, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Lables,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East Toronto,-79.293031,2,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant
12,East Toronto,-79.352188,2,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant
15,East Toronto,-79.315572,2,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant
17,East Toronto,-79.340923,2,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant
38,East Toronto,-79.321558,2,Greek Restaurant,Coffee Shop,Café,Park,Brewery,Italian Restaurant,Restaurant,Ice Cream Shop,Bookstore,American Restaurant


In [147]:
toronto_final.loc[toronto_final['Cluster Lables'] == 3, toronto_final.columns[[1] + list(range(4, toronto_final.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Lables,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,-79.360636,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
1,Downtown Toronto,-79.389494,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
2,Downtown Toronto,-79.378937,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
3,Downtown Toronto,-79.375418,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
5,Downtown Toronto,-79.373306,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
6,Downtown Toronto,-79.387383,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
7,Downtown Toronto,-79.422564,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
8,Downtown Toronto,-79.384568,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
10,Downtown Toronto,-79.381752,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar
13,Downtown Toronto,-79.381576,3,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Gym,Bakery,Beer Bar


### as is evident from the tables above cluster 0 i.e. in central toronto coffee shops are most popular and is segregated from all the other clusters  italian restaurants and most importantly GIFT SHOPS, for cluster 1,Wesr Toronto, the favourite is bar followed by cafes and differentiated by bar as preference since no other cluster shares the preference, for cluster 2 ie. east Toronto the differentiator is the affinity for brewery places followed by downtown toronto wi