In [1]:
import lxml.html as lh
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Sanity check: reaching this line means every import above succeeded.
print('all is imported')

all is imported


Scrape data from url and select the table values

In [91]:
# Download the Wikipedia page and pick out the table to parse.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')

# The first <table> on the page is used; presumably that is the postal-code
# listing -- verify if the page layout changes.
table = soup.find('table')
if table is None:
    # Fail here with a clear message instead of a confusing AttributeError later.
    raise ValueError("No <table> element found at " + url)

Parse all table rows and convert them to a dataframe with the columns Postal Code, Borough and Neighbourhood

In [101]:
# Turn the scraped HTML table into a dataframe of assigned postal codes.
# Every <tr> of the table, header row included.
codes = table.find_all('tr')

Postcode = []
Borough = []
Neighbourhood = []

# Walk over however many rows the table actually has (no hard-coded row count),
# skipping the first (header) row exactly as before.
for row in codes[1:]:
    # Select the <td> cells directly so the whitespace text nodes that sit
    # between them in the markup cannot shift the column positions.
    cells = row.find_all('td')
    if len(cells) < 3:
        continue  # not a data row (e.g. a header made of <th> cells)

    # strip() removes the trailing "\n" that the cell text carries.
    Postcode.append(cells[0].get_text().strip())
    Borough.append(cells[1].get_text().strip())
    Neighbourhood.append(cells[2].get_text().strip())

# Create dataframe from the three parallel lists
df = pd.DataFrame({'Postal Code': Postcode, 'Borough': Borough, 'Neighbourhood': Neighbourhood})
# Remove rows whose Borough is unassigned
df2 = df[df.Borough != 'Not assigned']
# Assumption: No Neighbourhoods without an assigned value
df3 = df2.reset_index(drop=True)

Find duplicates within the table and combine Neighbourhoods

In [102]:
# Merge the neighbourhoods of rows that share a postal code into the first such
# row, and record the labels of the later rows so they can be dropped afterwards.
initlen = len(df3.index)

# True for every row whose postal code already appeared further up the frame.
is_repeat = df3.duplicated(subset='Postal Code', keep='first')

# For each row: the ", "-joined neighbourhoods of all rows with the same postal
# code, in their original row order.
combined = (
    df3.groupby('Postal Code', sort=False)['Neighbourhood']
    .transform(lambda names: ', '.join(names))
)

# Write the merged text into the first occurrence only. A single .loc assignment
# replaces the chained df3[...][i] = ... form, which raises
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
df3.loc[~is_repeat, 'Neighbourhood'] = combined[~is_repeat]

# Index labels of the redundant rows (each listed once).
index_drop = df3.index[is_repeat].tolist()

Drop all but the first of each set of duplicate rows and show the final dataframe

In [111]:
# Remove the rows that were merged into their first occurrence ...
df4 = df3.drop(index=index_drop)
# ... and renumber the remaining rows from 0.
df5 = df4.reset_index(drop=True)
df5  # final dataframe

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


Check if the last value contains all neighbourhoods

In [71]:
# Take the last row by position instead of the hard-coded label 102, so the
# check keeps working if the number of postal codes changes.
df5['Neighbourhood'].iloc[-1]

'Kingsway Park South West, Mimico NW, The Queensway West, Royal York South West, South of Bloor'

Check shape of the dataframe

In [99]:
# (rows, columns) of the de-duplicated frame.
df5.shape

(103, 3)

Obtain Latitude and Longitude for each postal code from a CSV file

In [107]:
# Load the coordinate table that is later joined to the postal codes.
geo_csv_url = 'http://cocl.us/Geospatial_data'
data = pd.read_csv(geo_csv_url)


Merge dataframe and geo dataframe

In [109]:
# Join the coordinates onto the postal-code frame; rows are matched on the
# shared 'Postal Code' column.
df6 = pd.merge(df5, data, on='Postal Code')
df6.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


Show a map with one marker per postal code

In [139]:
import folium

# Map with one marker per row of the merged frame; the popup shows the postal code.
m = folium.Map(location=[43.651070, -79.347015], zoom_start=9)

# Iterate over df6 itself rather than range(len(data)): the merge can drop
# rows, and indexing df6 by the length of `data` would then raise IndexError.
for lat, lon, code in zip(df6['Latitude'], df6['Longitude'], df6['Postal Code']):
    folium.Marker([lat, lon], popup=code).add_to(m)

m

Select the subset of the data whose Borough name contains "Toronto" and show it on a map

In [192]:
# Keep only the rows whose borough name mentions "Toronto", renumbered from 0.
toronto_mask = df6['Borough'].str.contains('Toronto')
df7 = df6[toronto_mask]
df8 = df7.reset_index(drop=True)

m2 = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One marker per remaining row, labelled with its postal code.
for _, row in df8.iterrows():
    position = [row['Latitude'], row['Longitude']]
    folium.Marker(position, popup=row['Postal Code']).add_to(m2)

m2

Clustering of the remaining data

In [210]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [222]:
# Cluster the Toronto rows by their coordinates.
# set number of clusters
kclusters = 7

# Feature matrix of coordinates. DataFrame.as_matrix() was removed in
# pandas 1.0; selecting the columns and calling to_numpy() is the replacement.
x = df8[['Latitude', 'Longitude']].to_numpy()

# run k-means clustering (random_state is fixed so reruns give the same labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(x)

# attach the cluster label generated for each row to the dataframe
df8['labels'] = kmeans.labels_
df8.head()



Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,labels
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,6
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,2


Plotting the labels of each cluster on map

In [223]:
# Draw every clustered row on a map, one toggleable layer and colour per cluster.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters: one colour per cluster, evenly spaced
# along the rainbow colormap (requires numpy as np from the import cell).
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

for cluster in range(kclusters):
    # Index the palette directly; the former `cluster - 1` only worked for
    # cluster 0 through negative-index wrap-around.
    cluster_color = rainbow[cluster]

    # Layer name shown in the layer control, coloured like its markers. Plain
    # double quotes are used: the former `\\"` put literal backslashes into the
    # HTML attribute and broke the styling.
    group = folium.FeatureGroup(
        name='<span style="color: {0};">{1}</span>'.format(cluster_color, cluster)
    )

    # Rows belonging to this cluster (labels cast to int for the comparison).
    members = df8[df8['labels'].astype(int) == cluster]

    for lat, lon, poi in zip(members['Latitude'], members['Longitude'], members['Borough']):
        popup = folium.Popup('ORIG. ' + str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
            (lat, lon),
            radius=5,
            popup=popup,
            color=cluster_color,
            fill=True,
            fill_color=cluster_color,
            fill_opacity=0.7,
        ).add_to(group)

    group.add_to(map_clusters)

# Expanded layer switcher so individual clusters can be shown or hidden.
folium.LayerControl(position='topright', collapsed=False).add_to(map_clusters)
map_clusters

Seven clusters give the best geographical segmentation of the boroughs whose names contain 'Toronto'.