**Importing required libraries**

In [1]:
import pandas as pd
import numpy as np
import folium
import geocoder
from geopy.geocoders import Nominatim
import json
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm 
import matplotlib.colors as colors 
from pandas.io.json import json_normalize
import warnings

warnings.filterwarnings('ignore')
print('done')

done


In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
results = requests.get(url).text

**Using BeautifulSoup to convert the request results into xml**

In [4]:
soap = BeautifulSoup(results,'lxml')

In [5]:
data = soap.find('table',{'class':'wikitable sortable'}).findAll('td')
print('total number of postal code records including Not Assigned : ', len(data)/3)

total number of postal code records including Not Assigned :  289.0


In [6]:
postal_code = []
borough = []
neighborhood = []
i=0
while i < len(data):
    if data[i+1].contents[0] !='Not assigned':
        if data[i].contents[0] in postal_code:
            index = postal_code.index(data[i].contents[0])
            neighborhood[index] = neighborhood[index] + ', ' + data[i+2].find('a').get('title') if data[i+2].find('a') else data[i+1].find('a').get('title')
        else:
            postal_code.append(data[i].contents[0])
            temp_borough = data[i+1].find('a').get('title') if data[i+1].find('a') else data[i+1].contents[0]
            borough.append(temp_borough)
            hood = data[i+2].find('a').get('title') if data[i+2].find('a') else data[i+2].contents[0]
            neighborhood.append(hood if hood !='Not assigned' else temp_borough)
    i = i+3
print('Extracting values from data is done')        

Extracting values from data is done


In [7]:
postal_data = pd.DataFrame()
postal_data['PostalCode'] = postal_code
postal_data['Borough'] = borough
postal_data['Neighborhood'] = neighborhood
postal_data.shape

(103, 3)

In [8]:
postal_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park (Toronto),Not assigned


In [9]:
postal_data[postal_data['PostalCode'] == 'M5V']['Neighborhood']

87    Downtown Toronto, King and Spadina, Railway La...
Name: Neighborhood, dtype: object

In [10]:
lon_lat = pd.read_csv('Geospatial_Coordinates.csv')
print('total unique postal values :', lon_lat['Postal Code'].nunique())
lon_lat.head()

total unique postal values : 103


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
latitude = []
longitude = []
postal = postal_data['PostalCode'].values
for value in postal:
    temp = lon_lat[lon_lat['Postal Code'] == value]
    latitude.append(temp['Latitude'].values[0])
    longitude.append(temp['Longitude'].values[0])
    
postal_data['Latitude'] = latitude
postal_data['Longitude'] = longitude

postal_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park (Toronto),Not assigned,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,"Scarborough, Toronto","Rouge, Toronto, Malvern, Toronto",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,Downtown Toronto,43.657162,-79.378937


In [12]:
Toronto = postal_data[postal_data['Borough'].str.contains('Toronto')]
Toronto = Toronto.sort_values(by=['Neighborhood'])
print(Toronto.shape)
Toronto.head()

(56, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
78,M1S,"Scarborough, Toronto","Agincourt, Toronto",43.7942,-79.262029
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
43,M6K,West Toronto,"Brockton\n, Exhibition Place, Parkdale Village",43.636847,-79.428191
100,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558
96,M4X,Downtown Toronto,"Cabbagetown, Toronto, St. James Town",43.667967,-79.367675


In [13]:
#Toronto['Longitude'].nunique()
x = Toronto.groupby(['Neighborhood','Latitude','Longitude']).count().reset_index()
x.shape

(56, 5)

In [14]:
CLIENT_ID = 'SO1C150DV4X5TTIF0STVVODGYUDAFV3ZICKZQXAHECMZV41K' # your Foursquare ID
CLIENT_SECRET = 'UNS3NE2PQC5YCXSEH5TBWVRGCRZ5IVIT4B4ATLO2WOEXL0L3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
radius = 500


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = None
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        if results is None:
            print('None result is :',name)
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
toronto_venues = getNearbyVenues(names=Toronto['Neighborhood'].values,
                                   latitudes=Toronto['Latitude'].values,
                                   longitudes=Toronto['Longitude'].values
                                  )
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Agincourt, Toronto",43.7942,-79.262029,Panagio's Breakfast & Lunch,43.79237,-79.260203,Breakfast Spot
1,"Agincourt, Toronto",43.7942,-79.262029,Twilight,43.791999,-79.258584,Lounge
2,"Agincourt, Toronto",43.7942,-79.262029,Mark's,43.791179,-79.259714,Clothing Store
3,"Agincourt, Toronto",43.7942,-79.262029,Commander Arena,43.794867,-79.267989,Skating Rink
4,Berczy Park,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store


In [16]:
toronto_venues.shape

(1833, 7)

In [17]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
toronto_onehot['Neighborhood Latitude'] = toronto_venues['Neighborhood Latitude']
toronto_onehot['Neighborhood Longitude'] = toronto_venues['Neighborhood Longitude'] 
# move neighborhood column to the first column
fixed_columns = ['Neighborhood'] + list(col for col in toronto_onehot.columns if col != 'Neighborhood')

toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.shape

(1833, 244)

Grouped by 'Neighborhood','Neighborhood Latitude','Neighborhood Longitude' because we have few cases where for a given Neighborhood we have multiple Latitude and Longitude

In [18]:
toronto_grouped = toronto_onehot.groupby(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head()

(55, 244)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Agincourt, Toronto",43.7942,-79.262029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,43.644771,-79.373306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton\n, Exhibition Place, Parkdale Village",43.636847,-79.428191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
4,"Cabbagetown, Toronto, St. James Town",43.667967,-79.367675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dividing the data into 5 clusters

In [19]:
kclusters = 5
toronto_grouped_cluster = toronto_grouped.drop(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=99).fit(toronto_grouped_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
address = 'Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [21]:
len(kmeans.labels_)

55

In [22]:
toronto_merged = toronto_grouped[['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']]

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_
toronto_merged.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels
0,"Agincourt, Toronto",43.7942,-79.262029,0
1,Berczy Park,43.644771,-79.373306,0
2,"Brockton\n, Exhibition Place, Parkdale Village",43.636847,-79.428191,0
3,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558,0
4,"Cabbagetown, Toronto, St. James Town",43.667967,-79.367675,0


**Please note unable to show the map in Github**

In [23]:
#create map
#Zoom is 10 as 11 is missing few clusters in the image
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Neighborhood Latitude'], toronto_merged['Neighborhood Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [24]:
toronto_merged['Cluster Labels'].value_counts()

0    49
1     3
4     1
3     1
2     1
Name: Cluster Labels, dtype: int64

**Below are the Neighborhood outliers(meaning which are singled out) based on clustering**

In [27]:
toronto_merged.loc[toronto_merged['Cluster Labels'].isin([2,3,4])]['Neighborhood']

28    Highland Creek (Toronto), Rouge Hill, Port Uni...
37                                            Roselawn

38                     Rouge, Toronto, Malvern, Toronto
Name: Neighborhood, dtype: object