# Web Scrapping 
We start scrapping the neighborhood data from the wikipedia page

In [1]:
#Load the packages required to scrap the data
from bs4 import BeautifulSoup
import requests

In [2]:
page_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
page_response=requests.get(page_link,timeout=5)
page_content=BeautifulSoup(page_response.content,"html.parser")

In [4]:
#print(page_content.prettify())

In [5]:
table_content=page_content.find('table',class_='wikitable sortable')
#print(table_content.prettify())

In [6]:
#Create a pandas dataframe
import pandas as pd
column_names=['PostalCode','Borough','Neighbourhood']


#Parsing the data stored in the table
Raw_table=table_content.find_all('tr')

#Define size of table using the number of raws obtained
length=len(Raw_table)
Toronto=pd.DataFrame(index=range(0,length), columns=column_names)

row_num=0
for row_value in Raw_table:
    row_values=row_value.find_all('td')
    column_num=0
    for value in row_values:
        text=value.get_text()
        text=text.replace('\n', '')
        Toronto.iat[row_num,column_num]=text
        column_num=column_num+1
    row_num=row_num+1

print(Toronto.head())
print(Toronto.shape)

  PostalCode       Borough     Neighbourhood
0        NaN           NaN               NaN
1        M1A  Not assigned      Not assigned
2        M2A  Not assigned      Not assigned
3        M3A    North York         Parkwoods
4        M4A    North York  Victoria Village
(289, 3)


# Data filtering
#### We proceed to clean the data as specified:
1- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

2- More than one neighborhood can exist in one postal code area.These two rows will be combined into one row with the neighborhoods separated with a comma.

3- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

### Condition number 1

In [7]:
#Calculate index meet the condition
Toronto.dropna(axis=0, inplace=True)

Index_Notassigned=Toronto[Toronto['Borough']=='Not assigned'].index.values
print("Number of rows meeting condition Borough is 'Not assigned':",len(Index_Notassigned))

Number of rows meeting condition Borough is 'Not assigned': 77


In [8]:

Num=0
#Eliminate the ones with text=Not assigned.
for values in Index_Notassigned:
    Toronto.drop([values],axis=0,inplace=True)
    Num=Num+1
print("The number of rows that have been deleted are:",Num)


The number of rows that have been deleted are: 77


### Condition number 3

In [9]:
Index_Notassigned2=Toronto[Toronto['Neighbourhood']=='Not assigned'].index.values
print("Number of rows meeting condition Neighbourhood is 'Not assigned':",len(Index_Notassigned2))

Number of rows meeting condition Neighbourhood is 'Not assigned': 1


In [10]:
Num=0
for values in Index_Notassigned2:
    Toronto.drop([values],axis=0,inplace=True)
    Num=Num+1
print("The number of rows that have been deleted are:",Num)

The number of rows that have been deleted are: 1


In [23]:
print(Toronto.head())
print(Toronto.shape)

  PostalCode           Borough     Neighbourhood
3        M3A        North York         Parkwoods
4        M4A        North York  Victoria Village
5        M5A  Downtown Toronto      Harbourfront
6        M5A  Downtown Toronto       Regent Park
7        M6A        North York  Lawrence Heights
(210, 3)


### Condition number 2

In [41]:
df=Toronto
#group the data by PostalCode
df['Neighbourhood']=df[['PostalCode','Borough','Neighbourhood']].groupby(['PostalCode','Borough'])['Neighbourhood'].transform(lambda x: ','.join(x))
Toronto=df[['PostalCode','Borough','Neighbourhood']].drop_duplicates()

## Resulting table with filtered data

In [42]:
Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Harbourfront,Regent Park,Harbourfront,Regent P..."
7,M6A,North York,"Lawrence Heights,Lawrence Manor,Lawrence Heigh..."
11,M9A,Etobicoke,Islington Avenue


## Shape of the table
The number of Neighbourhoods is equal to the number of rows on the table

In [43]:
Toronto.shape

(102, 3)