# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
!pip install BeautifulSoup4
!pip install lxml
!pip install geocoder

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 7.2MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.8.2 soupsieve-1.9.5
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 8.5MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0


In [2]:
#importing libraries
import numpy as np
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

import geocoder

from sklearn.cluster import KMeans

### Step 1: Getting Data

In [3]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page further down.
response = requests.get(wiki, timeout=30)
response.raise_for_status()
website_data = response.text

# Parse the downloaded HTML into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(website_data, "lxml")

In [4]:
# Every <table> element found in the parsed page.
all_tables = soup.find_all(name='table')

# The first table whose class attribute is exactly 'wikitable sortable';
# the rows are read from this table in the next cell.
right_table = soup.find(name='table', class_='wikitable sortable')

In [35]:
# Walk the table rows and collect the first text node of each of the three
# cells into parallel lists: A (first cell), B (second cell) and
# C (third cell, with trailing whitespace removed).
A, B, C = [], [], []

for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows that do not have exactly three <td> cells are skipped.
    if len(cells) != 3:
        continue
    first_cell, second_cell, third_cell = cells
    A.append(first_cell.find(text=True))
    B.append(second_cell.find(text=True))
    C.append(third_cell.find(text=True).rstrip())


In [36]:
# Assemble the three parallel lists into one DataFrame, one column per list.
df = pd.DataFrame({
    'Postcode': A,
    'Borough': B,
    'Neighbourhood': C,
})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Step 2: Data Cleanup

In [37]:
# Drop the rows whose Borough is unassigned. The explicit copy makes the
# result an independent frame, so the column assignment below does not
# write into a slice of the original (no SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# Where the Neighbourhood is unassigned, fall back to the row's Borough.
# The neighbourhood text was already right-stripped when it was scraped, so
# the label has no trailing newline; stripping again here makes the comparison
# match either form. mask() replaces row by row, keeping each Borough aligned
# with its own row.
neighbourhood_unassigned = df['Neighbourhood'].str.strip() == 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].mask(neighbourhood_unassigned, df['Borough'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [38]:
# Collapse the rows that share a Postcode and Borough into a single row whose
# Neighbourhood is the comma-separated list of the group's neighbourhoods.
df = (
    df.groupby(["Postcode", "Borough"])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [39]:
# Display the (rows, columns) size of the grouped frame.
df.shape

(103, 3)