### Import what I need...

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

### Get the webpage

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"652df336-7a6c-4f10-8854-52123f8436fb","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toron

### Get the table within and define the column names

In [4]:
my_table = soup.find('table', class_='wikitable')
column_names = [head.text.replace('\n','') for head in my_table.find_all('th')]

### Make an empty dataframe

In [8]:
neighborhoods = pd.DataFrame(columns = column_names)
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood


### Get the content inside the table and fill them into the dataframe

In [9]:
tds = [item.text.replace("<td>","").replace("<\td>", "").replace("\n","") for item in my_table.find_all('td')]
for i in range(0, len(tds)-1, 3):
    if tds[i+1]=="Not assigned":
        continue
    elif tds[i+2]=="Not assigned":
        s = pd.Series({column_names[0]:tds[i], column_names[1]:tds[i+1], column_names[2]:tds[i+1]})
    else:
        s = pd.Series({column_names[0]:tds[i], column_names[1]:tds[i+1], column_names[2]:tds[i+2]})
    neighborhoods = neighborhoods.append(s, ignore_index = True)
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
neighborhoods.shape

(103, 3)

In [11]:
!wget -q -O 'geo.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [12]:
geo = pd.read_csv('geo.csv')

In [13]:
neighborhoods = neighborhoods.join(geo.set_index('Postal Code'), on='Postal Code')
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Find entries have "Toronto"

In [16]:
toronto_data = neighborhoods[neighborhoods.Borough.str.endswith('Toronto')]

In [17]:
toronto_data.sort_values(by=['Postal Code'], ascending=True, inplace=True)
toronto_data.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


### Some statistics

In [18]:
toronto_data.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
West Toronto,6,6,6,6


In [14]:
import folium

In [19]:
toronto_map = folium.Map(location=[toronto_data['Latitude'][2], toronto_data['Longitude'][2]], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

### Using the little information, I cluster these points by coordinates...

In [21]:
toronto_data.reset_index(drop=True, inplace=True)

In [22]:
toronto_clustering = toronto_data.drop(['Postal Code', 'Borough', 'Neighborhood'], 1)

In [23]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [24]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

In [25]:
toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)

In [26]:
toronto_data.head(10)

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,0,M4M,East Toronto,Studio District,43.659526,-79.340923
4,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,1,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,1,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,1,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


### Draw the map with the clustered points

In [28]:
map_clusters =  folium.Map(location=[toronto_data['Latitude'][2], toronto_data['Longitude'][2]], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters