# Segmenting and clustering neighbourhoods in Toronto
### (Capstone project course on Coursera)

# Part 1

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bsoup

### URL of the Wikipedia page listing Toronto postal codes:

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook here on an HTTP
# error instead of letting later cells parse an error page.
page_req = requests.get(url, timeout=30)
page_req.raise_for_status()

### Parsing the HTML page and extracting the table from it:

In [10]:
# Parse the downloaded HTML, then take the first <table> element in the document.
page = bsoup(page_req.text, "html.parser")
pgtable = page.find("table")

In [14]:
# Collect every <tr> element of the table; results[0] is used as the header
# row further down, and nrows bounds the row loop that builds the dataframe.
results = pgtable.find_all("tr")
nrows = len(results)
nrows

181

In [38]:
# Split the header row's text on newlines and keep the pieces at positions
# 1, 3 and 5, which hold the three column titles.
header_parts = results[0].text.split("\n")
headers = [header_parts[pos] for pos in (1, 3, 5)]
headers

['Postal Code', 'Borough', 'Neighbourhood']

### Creating a dataframe from the table on the Wikipedia page:

In [48]:
# Build one (postal code, borough, neighbourhood) record per data row.
# Each value is read from its own table cell instead of taking fixed positions
# from the row text split on newlines, so a newline inside a cell cannot shift
# the columns silently.
records = []
for row in results[1:nrows]:  # results[0] is the header row
    cells = [cell.get_text().strip() for cell in row.find_all(["td", "th"])]
    if len(cells) < 3:
        # Fail loudly with the offending row rather than with a bare IndexError.
        raise ValueError(
            f"Expected at least 3 cells in table row, got {len(cells)}: {cells!r}"
        )
    records.append(tuple(cells[:3]))

df = pd.DataFrame(records, columns = ["Postalcode", "Borough", "Neighbourhood"])
print(df.head(), "\n", df.shape)

  Postalcode           Borough              Neighbourhood
0        M1A      Not assigned               Not assigned
1        M2A      Not assigned               Not assigned
2        M3A        North York                  Parkwoods
3        M4A        North York           Victoria Village
4        M5A  Downtown Toronto  Regent Park, Harbourfront 
 (180, 3)


In [49]:
# Inspect the last rows of the scraped table.
df.tail()

Unnamed: 0,Postalcode,Borough,Neighbourhood
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


### Dropping rows for which borough is not assigned:

In [56]:
# Keep only the rows whose borough is assigned. An exact comparison is used
# instead of str.contains, which performs a regex/substring match and would
# also drop any borough that merely contains the text "Not assigned".
# reset_index gives the filtered frame a fresh 0..n-1 index.
df1 = df[df["Borough"] != "Not assigned"].reset_index(drop = True)
df1.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### For rows where the neighbourhood is not assigned, use the borough as the neighbourhood:

In [57]:
# Rows whose neighbourhood is the placeholder text take their borough name
# as the neighbourhood instead.
unassigned = df1["Neighbourhood"].eq("Not assigned")
df1.loc[unassigned, "Neighbourhood"] = df1.loc[unassigned, "Borough"]
df1.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [62]:
# Number of distinct postal codes in df1.
df1['Postalcode'].nunique()

103

The number of unique postal codes (103) equals the number of rows in the dataframe (see `df1.shape` below), so there are no repeated postal codes.

In [63]:
# Show the first 15 rows of the cleaned dataframe.
df1.head(15)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [64]:
# Dimensions of the cleaned dataframe as (rows, columns).
df1.shape

(103, 3)

------End of part 1------