In [77]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# 1. WebScraping to Get Data

In [78]:
# passing the wikipedia url where we have the data tables
Toronto_neighbors_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

req = requests.get(Toronto_neighbors_url, headers) # Get the data using requests
soup = BeautifulSoup(req.content, 'html.parser')  #Creating BeautifulSoup object to pull data in html format
soup.contents

['html',
 '\n',
 <html class="client-nojs" dir="ltr" lang="en">
 <head>
 <meta charset="utf-8"/>
 <title>List of postal codes of Canada: M - Wikipedia</title>
 <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"d63c3b66-29e7-4a77-97b7-1caec4713e98","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1029579868,"wgRevisionId":1029579868,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Onta

In [79]:
table_contents = []
table = soup.find('table')  #now using soup object we can acess the tables present in data

for row in table.findAll('td'):  #getting the td->table data of each row
    cell = {}  #creating an empty dictionary 
    if row.span.text == 'Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3] #filling the data(value) of 'postalCode' giving this as key to the dictionary 
        cell['Borough'] = (row.span.text).split('(')[0] #same
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace('/',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)
print(table_contents[:5]) #now we have the data of each row in this list object but the list elements are dictionaries  
df = pd.DataFrame(table_contents)  #convert that list to dataframe
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.head()

[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'}, {'PostalCode': 'M4A', 'Borough': 'North York', 'Neighborhood': 'Victoria Village'}, {'PostalCode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Regent Park , Harbourfront'}, {'PostalCode': 'M6A', 'Borough': 'North York', 'Neighborhood': 'Lawrence Manor , Lawrence Heights'}, {'PostalCode': 'M7A', 'Borough': "Queen's Park", 'Neighborhood': 'Ontario Provincial Government'}]


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [80]:
df['Neighborhood'] = df.groupby('PostalCode')['Neighborhood'].transform(lambda x: ','.join(x))
df = df.drop_duplicates()

# 2. Get The Latitude and Longitude Coordinates of Each From Geospatial_coordinates.csv

In [81]:
Geospatial_coordinates_df = pd.read_csv("Geospatial_coordinates.csv")
Geospatial_coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [82]:
df = df.merge(Geospatial_coordinates_df, left_on = 'PostalCode', right_on='Postal Code')
df = df.drop(df.columns[3], axis=1)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Explore Cluster The Neighborhoods in Toronto

In [83]:
toronto_df = df[df['Borough'].str.contains('Toronto')]
toronto_df = toronto_df.reset_index()
toronto_df = toronto_df.drop('index',axis = 1)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [84]:
toronto_df.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,17,17,17,17
Downtown Toronto Stn A,1,1,1,1
East Toronto,4,4,4,4
East Toronto Business,1,1,1,1
East York/East Toronto,1,1,1,1
West Toronto,6,6,6,6


In [85]:
for borough in list(set(toronto_df['Borough'])):
    borough_neighborhood_df = toronto_df[toronto_df['Borough']==borough]
    print(borough_neighborhood_df)
    print("\n--------------------------------------\n")

   PostalCode                 Borough    Neighborhood   Latitude  Longitude
34        M5W  Downtown Toronto Stn A  Enclave of M5E  43.646435 -79.374846

--------------------------------------

   PostalCode           Borough  \
0         M5A  Downtown Toronto   
1         M5B  Downtown Toronto   
2         M5C  Downtown Toronto   
4         M5E  Downtown Toronto   
5         M5G  Downtown Toronto   
6         M6G  Downtown Toronto   
7         M5H  Downtown Toronto   
10        M5J  Downtown Toronto   
13        M5K  Downtown Toronto   
16        M5L  Downtown Toronto   
27        M5S  Downtown Toronto   
30        M5T  Downtown Toronto   
32        M5V  Downtown Toronto   
33        M4W  Downtown Toronto   
35        M4X  Downtown Toronto   
36        M5X  Downtown Toronto   
37        M4Y  Downtown Toronto   

                                         Neighborhood   Latitude  Longitude  
0                          Regent Park , Harbourfront  43.654260 -79.360636  
1                   

In [86]:
# Create Map using folium

import folium
map_network = folium.Map(location = [43.6532, 79.3832], zoom_start =0)

# add markers to map
for lat,lng,borough,neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_tml=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_network)

map_network

# Clustering

In [87]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_df.drop('Neighborhood', axis=1)
toronto_grouped_clustering = toronto_grouped_clustering.drop('PostalCode', axis=1)
toronto_grouped_clustering = toronto_grouped_clustering.drop('Borough', axis=1)

#run k-means clustering
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]

array([4, 4, 4, 1, 4, 4, 0, 4, 3, 1])

In [88]:

toronto_df.insert(0,'Cluster Labels', kmeans.labels_)

toronto_df

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,4,M5H,Downtown Toronto,"Richmond , Adelaide , King",43.650571,-79.384568
8,3,M6H,West Toronto,"Dufferin , Dovercourt Village",43.669005,-79.442259
9,1,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


In [89]:
# create map
map_clusters = folium.Map(location = [43.6532, 79.3832], zoom_start=0)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to map
markers_colors=[]
for lat,lng,neighborhood,cluster in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood'], toronto_df['Cluster Labels']):
    label = folium.Popup(str(neighborhood)+', Cluster'+str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = rainbow[cluster-1],
        fill=True,
        fil_color = rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    

map_clusters