# Segmenting and Clustering Neighborhoods in Toronto

## Applied Data Science Capstone Week 3 Project
### Part 1 - Data Preparation

---

In [1]:
#Import libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [2]:
#Scrape Wiki page using beautifulsoup
# A timeout keeps the cell from hanging on a stalled connection, and raise_for_status()
# stops on an HTTP error instead of parsing an error page as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
tables = soup.find('table', {'class':'wikitable sortable'})
# soup.find returns None when nothing matches; fail here with a clear message rather than
# with an AttributeError in the cell that iterates over the table rows.
if tables is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

In [3]:
# Collect the cell text of every table row (header row included)
Toronto_FSA = [
    row.text.strip().split("\n")  # trim outer whitespace, then separate the row text on newlines
    for row in tables.find_all('tr')
]

Toronto_FSA[:5] #show the first 5 rows.

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

In [4]:
# Turn the list of row lists into a pandas DataFrame (the header is still the first data row here)
Toronto_df = pd.DataFrame(data=Toronto_FSA)
Toronto_df.head(5)

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [5]:
#replace the header with first row
# Rebuild the frame from the scraped list instead of slicing Toronto_df in place, so that
# re-running this cell cannot drop a further data row or promote a data row to the header.
# Passing plain column labels also avoids leaving a stray name on the column axis.
header_row, *data_rows = Toronto_FSA #first scraped row holds the column names
Toronto_df = pd.DataFrame(
    data_rows,
    columns=header_row,
    index=range(1, len(data_rows) + 1), #keep the 1-based row labels the sliced frame had
)

In [6]:
#Display the first 10 rows of the dataframe to check the column labels and the row contents
Toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


In [7]:
#Check dataframe shape (rows, columns) before the cleaning steps that follow
Toronto_df.shape

(288, 3)

In [8]:
#Remove Borough = "Not assigned" rows
# .copy() makes the filtered result an independent frame, so values can be assigned into it
# afterwards without pandas raising SettingWithCopyWarning.
Toronto_df = Toronto_df[Toronto_df['Borough'] != 'Not assigned'].copy()
Toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [9]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# Series.where keeps the value where the condition is True and takes the Borough value elsewhere.
# assign() and reset_index() return a new frame, so nothing is written into the filtered
# frame in place (no SettingWithCopyWarning, no inplace=True).
neighbourhood_known = Toronto_df['Neighbourhood'] != 'Not assigned'
Toronto_df = Toronto_df.assign(
    Neighbourhood=Toronto_df['Neighbourhood'].where(neighbourhood_known, Toronto_df['Borough'])
).reset_index(drop=True)
Toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [10]:
#Check dataframe shape after removing unassigned boroughs and filling in unassigned neighbourhoods
Toronto_df.shape

(211, 3)

In [11]:
#Collapse rows that share a postcode (and borough) into one row, joining their Neighbourhood names with ", "
neighbourhoods_by_fsa = Toronto_df.groupby(['Postcode', 'Borough'])['Neighbourhood']
Toronto_df = neighbourhoods_by_fsa.apply(', '.join).reset_index()
Toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
#Check dataframe shape after merging neighbourhoods that share a postcode
Toronto_df.shape

(103, 3)


In [13]:
# Write the cleaned table to disk for part 2; index=False leaves the row index out of the file
Toronto_df.to_csv(path_or_buf='Toronto_FSAs.csv', index=False)