## Assignment Part I - Segmenting and Clustering Neighborhoods in Toronto

### Load libraries

In [1]:
import pandas as pd
import numpy as np

### Scrape the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M to obtain the table of postal codes, and transform that data into a pandas dataframe

In [3]:
# Load libraries
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page. The timeout prevents the cell from hanging on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the article.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
req = response.text
soup = BeautifulSoup(req, "lxml")

wikitables = soup.find_all("table", {"class": "wikitable sortable"})

# Search through the tables for the one with the headings we want.
for table in wikitables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighborhood']:
        break
else:
    # Without this guard the loop would fall through and the code below would
    # silently scrape whichever table happened to be last (or fail with a
    # NameError when the page has no matching tables at all).
    raise ValueError(
        "No 'wikitable sortable' table with headings "
        "Postcode/Borough/Neighborhood was found on the page"
    )

# Extract the first three columns and write them to a semicolon-delimited text file.
# The file is written as UTF-8 explicitly so it matches the default decoding
# used by pd.read_csv regardless of the platform's locale encoding.
with open('postalcode_canada', 'w', encoding='utf-8') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells (e.g. the header row) carry no data.
            continue
        postcode, borough, neighborhood = [td.text.strip() for td in tds[:3]]
        print('; '.join([postcode, borough, neighborhood]), file=fo)

In [4]:
# Read the scraped rows back from the semicolon-delimited file (it has no header row)
pc_table = pd.read_csv('postalcode_canada', sep=';', header=None)
pc_table.columns = ["Postcode", "Borough", "Neighborhood"]

# The file was written with '; ' between fields, so the second and third
# fields carry a leading space: trim surrounding whitespace from both.
for text_column in ("Borough", "Neighborhood"):
    pc_table[text_column] = pc_table[text_column].str.strip()

# Keep only the rows whose borough has been assigned
is_assigned = pc_table["Borough"] != "Not assigned"
pc_table = pc_table[is_assigned]
pc_table.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [5]:
# Several neighborhoods may share one postal code: collapse each
# (Postcode, Borough) pair into a single row whose remaining text column is
# the comma-separated join of the group's values.
grouped_by_postcode = pc_table.groupby(
    by=["Postcode", "Borough"], as_index=False, sort=False
)
pc_table = grouped_by_postcode.agg(', '.join)
pc_table.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [6]:
# If a row has a borough but an unassigned neighborhood, use the borough name
# as the neighborhood.
# This is done with a boolean mask and .loc rather than by assigning to the
# rows yielded by iterrows(): those rows may be copies, in which case writes
# to them never reach the dataframe.
unassigned_neighborhood = pc_table["Neighborhood"] == "Not assigned"
pc_table.loc[unassigned_neighborhood, "Neighborhood"] = pc_table.loc[
    unassigned_neighborhood, "Borough"
]
pc_table.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [7]:
# Show the dataframe's dimensions as a (rows, columns) tuple
pc_table.shape

(103, 3)

### Store Output from Part I to a File

In [8]:
# Persist the Part I result as CSV, leaving the row index out of the file
part1_output_path = "Toronto1.csv"
pc_table.to_csv(part1_output_path, index=False)