# Import the relevant Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Insert Wiki URL

In [2]:
wiki_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wiki_url).text

## Beautiful Soup to Parse the url page

In [3]:
soup = BeautifulSoup(response, 'xml')

In [4]:
# Target table
table=soup.find('table')

## Create dataframe with three columns: PostalCode, Borough, and Neighborhood

In [5]:
column_names=['Postalcode','Borough','Neighbourhood']
df = pd.DataFrame(columns=column_names)

In [6]:
df

Unnamed: 0,Postalcode,Borough,Neighbourhood


## Extracting information from the table and input into dataframe

In [7]:
for tr_cell in table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data

In [8]:
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Ignore cells with a borough that is Not assigned.

In [9]:
df=df[df['Borough']!='Not assigned']

## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [10]:
df[df['Neighbourhood']=='Not assigned']=df['Borough']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_array(key, value)


## To solve the issue that more than one neighborhood can exist in one postal code area. 

In [11]:
# Group multiple neighbourhoods into one Postcode
temp=df.groupby('Postalcode')['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x))
temp=temp.reset_index(drop=False)
temp.rename(columns={'Neighbourhood':'joined'},inplace=True)
# Join the new data frame
df_merge = pd.merge(df, temp, on='Postalcode')
# Drop the Neighbourhood column
df_merge.drop(['Neighbourhood'],axis=1,inplace=True)
# Drop duplicates
df_merge.drop_duplicates(inplace=True)
# Rename columns
df_merge.rename(columns={'joined':'Neighbourhood'},inplace=True)

In [12]:
df_merge.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,Queen's Park,Queen's Park,Queen's Park


## Dataframe Shape

In [13]:
df_merge.shape

(103, 3)

In [14]:
df_merge

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,Queen's Park,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,"Rouge, Malvern"
10,M3B,North York,Don Mills North
11,M4B,East York,"Woodbine Gardens, Parkview Hill"
13,M5B,Downtown Toronto,"Ryerson, Garden District"
