# Segmenting and Clustering Neighborhoods in Toronto | Part 1

In [1]:
import pandas as pd 
import numpy as np 

### Scraping Wikipedia Table

In [2]:
# Read every HTML table on the Wikipedia page with pandas and keep the first one
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page_tables = pd.read_html(WIKI_URL, header=0)  # header=0: first table row supplies the column names
df = page_tables[0]
print(type(df))  # Sanity check: the selected table should be a pandas DataFrame
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Cleaning Dataset 

In [3]:
# Keep only the rows whose Borough is something other than "Not assigned"
has_borough = df["Borough"].ne("Not assigned")
df = df[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# Collapse rows sharing the same (Postal Code, Borough) pair into a single row,
# joining their neighbourhood names into one comma-separated string
by_code_and_borough = df.groupby(['Postal Code', 'Borough'])
joined = by_code_and_borough.agg({'Neighbourhood': ', '.join})
df = joined.reset_index()  # turn the group keys back into ordinary columns

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# Replace cells with no assigned neighbourhood with the borough name.
# The rows yielded by DataFrame.iterrows() may be copies, so assigning into
# `row` is not guaranteed to change df. Select the affected rows with a
# boolean mask and write through .loc so the frame itself is updated.
unassigned = df["Neighbourhood"] == "Not assigned"
df.loc[unassigned, "Neighbourhood"] = df.loc[unassigned, "Borough"]

In [6]:
# Persist the cleaned table to disk so the next section can load it.
# to_csv is called with its defaults, so the row index is written as well.
output_csv = 'capstone_canada.csv'
df.to_csv(output_csv)


In [7]:
# Report the (rows, columns) shape of the cleaned dataframe
frame_shape = df.shape
print("The dimensions of the dataframe is", frame_shape)

The dimensions of the dataframe is (103, 3)
