# Week3: Part 1

# 1. Importing Libraries

Libraries to be used in the Capstone Project

In [4]:
import numpy as np
import pandas as pd
import json

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Libraries imported.


# 2. Scraping

Fetching the HTML of the Wikipedia page and storing it in an object, so that a pandas dataset can be built from it afterwards.

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read.
with urlopen(url) as response:
    page = response.read().decode('utf-8')

soup = BeautifulSoup(page, 'html.parser')

# The postal-code table is the first <table> inside <body>. html.parser does
# not add a <tbody> on its own, so fall back to the <table> element when the
# markup has none; searching for <tr> rows works on either element.
first_table = soup.body.table
wiki_table = first_table.tbody if first_table.tbody is not None else first_table
print(wiki_table)

<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" tit

Creating two functions: the first extracts the cells of a single row, and the second collects every row of the table so the dataset can be built by appending these rows.

In [6]:
def get_cell(obj):
    """Return the text of every ``<td>`` cell found in ``obj``.

    For each cell, the text of its first link is used when the cell has a
    link with non-empty text; otherwise the cell's own text is used with
    surrounding whitespace removed.

    Parameters
    ----------
    obj
        A parsed element (typically a ``<tr>``) that supports
        ``find_all('td')``.

    Returns
    -------
    list of str
        One entry per ``<td>`` cell, in document order. Empty when ``obj``
        has no ``<td>`` cells.
    """
    row = []

    for cell in obj.find_all('td'):
        link = cell.a
        if link is not None and link.text:
            row.append(link.text)
            continue

        # ``.string`` is None when the cell holds more than one child node
        # (for example an empty link followed by plain text); calling
        # ``.strip()`` on it would raise AttributeError, so fall back to the
        # concatenated text of the whole cell.
        text = cell.string if cell.string is not None else cell.get_text()
        row.append(text.strip())

    return row

def get_row(table):
    """Collect the data rows of ``table`` as lists of three cell texts.

    Every ``<tr>`` found in ``table`` is handed to ``get_cell``. Rows that do
    not yield exactly three cells are left out, which drops the header row
    because it only contains ``<th>`` cells.

    Returns a list of three-element lists, in document order.
    """
    parsed_rows = (get_cell(tr) for tr in table.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells) == 3]

Creating a pandas DataFrame with only three columns, as required by the capstone project instructions.

In [7]:
# Parse the scraped table into a list of three-element rows.
data = get_row(wiki_table)
columns = ['Postcode', 'Borough', 'Neighbourhood']
# One DataFrame row per table row; 'Not assigned' entries are still present
# at this point and are handled in the clean-up section.
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# 3. Arranging the dataset
In this part, I'll deal with some data issues, such as duplicate rows.

First we exclude observations whose Borough is "Not assigned", then we sort the dataset by Postcode and Borough, and finally we reset the index.

In [8]:
# Keep only the rows that have a borough, order them by postcode and then by
# borough, and renumber the rows from 0 without keeping the old index.
df1 = (
    df[df.Borough != 'Not assigned']
    .sort_values(by=['Postcode', 'Borough'])
    .reset_index(drop=True)
)

df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


We aggregate the rows by Postcode and Borough: when a Postcode/Borough pair has two or more Neighbourhoods, they are combined into a single Neighbourhood value, separated by ", ".

In [9]:
# Collapse the rows to one per (Postcode, Borough) pair, joining the values of
# the remaining column for each pair into a single ", "-separated string.
df2 = df1.groupby(["Postcode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))

# Row count before grouping, then the number of distinct postcodes.
print(df1.shape)
print(df1.Postcode.unique().shape)

# Rows whose neighbourhood is still the 'Not assigned' placeholder.
df2[df2['Neighbourhood']=='Not assigned']

(211, 3)
(103,)


Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


Finally, if a row has a Borough but no Neighbourhood, we fill Neighbourhood with the value of the Borough column.

In [10]:
# Where a postcode has a borough but no neighbourhood, reuse the borough name.
# The rows are selected with a boolean mask and the columns by label. Looping
# over df2.index and passing those labels to positional .iloc is only correct
# while the index is exactly 0..n-1 and the column order never changes.
unassigned = df2['Neighbourhood'] == 'Not assigned'
df2.loc[unassigned, 'Neighbourhood'] = df2.loc[unassigned, 'Borough']

df2.shape

(103, 3)

# Week3 : Part2

# 1. Getting Coordinates

In [11]:
import geocoder

In [12]:
def get_latlng(postal_code, max_attempts=None):
    """Return the ``[latitude, longitude]`` pair for a Toronto postal code.

    The ArcGIS geocoder is queried with ``'<postal_code>, Toronto, Ontario'``
    and the lookup is repeated while it comes back without coordinates.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. ``'M4G'``.
    max_attempts : int or None, optional
        Upper bound on the number of lookups. ``None`` (the default) retries
        indefinitely, so a postal code that never resolves never returns.
        Pass a number to give up after that many lookups instead.

    Returns
    -------
    list
        The ``latlng`` value reported by the geocoder.

    Raises
    ------
    RuntimeError
        If ``max_attempts`` is given and that many lookups all returned no
        coordinates.
    """
    # The query does not change between retries, so build it once.
    query = '{}, Toronto, Ontario'.format(postal_code)

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'No coordinates found for {!r} after {} attempt(s)'.format(postal_code, attempts)
    )

get_latlng('M4G')

[43.70976500000006, -79.36390090899994]

In [13]:
postal_codes = df2['Postcode']    
# One geocoder lookup per postcode, in the row order of df2, so coords[k]
# belongs to the k-th row of df2.
coords = [ get_latlng(postal_code) for postal_code in postal_codes.tolist() ]

In [14]:
# Build the coordinate frame on df2's own index. Column assignment aligns on
# index labels, so this keeps every coordinate on the row it was looked up
# for even if df2's index is not the default 0..n-1.
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'], index=df2.index)
df2['Latitude'] = df_coords['Latitude']
df2['Longitude'] = df_coords['Longitude']
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


# Week3 : Part3


In [15]:
# Work on an independent (deep) copy so that adding columns to it later does
# not modify df2.
df_final = df2.copy()

In [16]:
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

# Cluster the postcodes on their (latitude, longitude) pairs.
X = df_final['Latitude']
Y = df_final['Longitude']
Z = np.stack((X, Y), axis=1)

n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(Z)

clusters = kmeans.labels_
# One fill colour per cluster label. Deliberately not named `colors`: the
# imports at the top of the notebook bind that name to the matplotlib.colors
# module, and rebinding it to a list here would shadow the module.
cluster_colors = ['red', 'green', 'blue', 'yellow']
assert len(cluster_colors) >= n_clusters, 'need one colour per cluster'
df_final['Cluster'] = clusters

# Draw one marker per postcode, labelled with its borough and filled with the
# colour of the cluster it was assigned to.
for latitude, longitude, borough, cluster in zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

toronto_map