Read in the libraries 

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Get the data from Wikipedia

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_html = requests.get(url).text
soup = BeautifulSoup(wiki_html, 'html.parser')

In [4]:
data = []
for tr in soup.tbody.find_all('tr'):
    data.append([ td.get_text().strip() for td in tr.find_all('td')])

Put the data into a Pandas dataframe

In [5]:
df=pd.DataFrame(data,columns=['PostalCode','Borough','Neighborhood2'])

In [6]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood2
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
...,...,...,...
176,M5Z,Not assigned,
177,M6Z,Not assigned,
178,M7Z,Not assigned,
179,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


Replace slashes with commas

In [7]:
df['Neighborhood2'] = df['Neighborhood2'].str.replace(' /',',')

In [8]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood2
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
...,...,...,...
176,M5Z,Not assigned,
177,M6Z,Not assigned,
178,M7Z,Not assigned,
179,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Clean the Borough column

In [9]:
indexNames = df[(df['Borough'] == "Not assigned")].index

In [10]:
df.drop(indexNames,inplace=True)

In [11]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood2
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [12]:
df.dropna(inplace=True)

In [13]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood2
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Ensure the data is collapsed

In [14]:
df=df.groupby(['PostalCode','Borough'])['Neighborhood2'].apply(', '.join).reset_index()

In [15]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood2
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Clean the Neighborhood2 column

In [16]:
def custom_fx(data):
    if data['Neighborhood2']=='Not assigned':
        var=data['Borough']
    else:
        var=data['Neighborhood2']
    return var

In [17]:
df['Neighborhood']=df.apply(custom_fx,axis='columns')

In [18]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood2,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge","Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek","Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill","Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn,Woburn
4,M1H,Scarborough,Cedarbrae,Cedarbrae


Check to make sure there are no 'Not Assigned' records left

In [19]:
print("There are {} rows that have 'Not assigned' in Neighborhood column in the dataframe".format(
    len(df[df['Neighborhood']=='Not assigned'])
)
     )

There are 0 rows that have 'Not assigned' in Neighborhood column in the dataframe


Clean up the dataframe by dropping the Neighborhood2 column

In [20]:
df.drop(columns='Neighborhood2')

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


Print out the shape of the dataframe

In [20]:
print("The shape of the dataframe is {}. The dataset has {} rows.".format
      (df.shape,df.shape[0]))

The shape of the dataframe is (103, 4). The dataset has 103 rows.


In [1]:
df.to_csv(r'Segmenting_Clustering_Neighborhoods_Toronto.csv',index=None,header=True)

NameError: name 'df' is not defined