# Read: Please scroll down for the second item of the submission
## Web Scraping
We start by scraping the neighbourhood data from the Wikipedia page.

In [1]:
# Load the packages required to scrape the data
from bs4 import BeautifulSoup
import requests

In [2]:
# Wikipedia article listing the Canadian postal codes that start with "M"
page_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the page (give up after 5 seconds) and parse its HTML.
page_response = requests.get(page_link, timeout=5)
# Fail fast on an HTTP error status (4xx/5xx) instead of silently parsing an
# error page, which would only surface later as a missing table.
page_response.raise_for_status()
page_content = BeautifulSoup(page_response.content, "html.parser")

In [4]:
#print(page_content.prettify())

In [5]:
# Grab the first <table> whose class attribute is "wikitable sortable"
table_content = page_content.find('table', attrs={'class': 'wikitable sortable'})

In [6]:
# Build a pandas DataFrame with one row per <tr> element of the scraped table
import pandas as pd

column_names = ['PostalCode', 'Borough', 'Neighbourhood']

# Every <tr> element found inside the table
Raw_table = table_content.find_all('tr')

# Pre-allocate one (initially all-NaN) DataFrame row per table row
length = len(Raw_table)
Toronto = pd.DataFrame(index=range(0, length), columns=column_names)

# Copy the text of each <td> cell, with newlines removed, into the matching
# DataFrame position. A row that contains no <td> cells is left as NaN
# (presumably the header row, which shows up as the NaN row 0 in the output).
for row_num, row_value in enumerate(Raw_table):
    for column_num, value in enumerate(row_value.find_all('td')):
        Toronto.iat[row_num, column_num] = value.get_text().replace('\n', '')

# Show the first rows followed by the (rows, columns) shape
print(Toronto.head(), Toronto.shape, sep='\n')

  PostalCode       Borough     Neighbourhood
0        NaN           NaN               NaN
1        M1A  Not assigned      Not assigned
2        M2A  Not assigned      Not assigned
3        M3A    North York         Parkwoods
4        M4A    North York  Victoria Village
(289, 3)


# Data filtering
#### We proceed to clean the data as specified:
1- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

2- More than one neighborhood can exist in one postal code area. The rows that share a postal code will be combined into one row, with the neighborhoods separated by a comma.

3- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

### Condition number 1

In [7]:
# Remove every row that contains a NaN value
Toronto.dropna(axis=0, inplace=True)

# Collect the index labels of the rows whose Borough is 'Not assigned'
is_unassigned_borough = Toronto['Borough'] == 'Not assigned'
Index_Notassigned = Toronto.loc[is_unassigned_borough].index.values
print("Number of rows meeting condition Borough is 'Not assigned':", len(Index_Notassigned))

Number of rows meeting condition Borough is 'Not assigned': 77


In [8]:

# Drop all rows whose Borough is 'Not assigned' in a single call instead of
# one drop per row (each single-row drop rebuilds the whole frame).
rows_before = len(Toronto)
Toronto.drop(Index_Notassigned, axis=0, inplace=True)

# Report how many rows were actually removed from the frame
Num = rows_before - len(Toronto)
print("The number of rows that have been deleted are:", Num)


The number of rows that have been deleted are: 77


### Condition number 3

In [9]:
# Collect the index labels of the rows whose Neighbourhood is 'Not assigned'
is_unassigned_neighbourhood = Toronto['Neighbourhood'] == 'Not assigned'
Index_Notassigned2 = Toronto.loc[is_unassigned_neighbourhood].index.values
print("Number of rows meeting condition Neighbourhood is 'Not assigned':", len(Index_Notassigned2))

Number of rows meeting condition Neighbourhood is 'Not assigned': 1


In [10]:
# Where the neighbourhood is 'Not assigned', reuse the borough name of the
# same row. The right-hand side is aligned to the target rows by index label.
Toronto.loc[Index_Notassigned2, 'Neighbourhood'] = Toronto.loc[Index_Notassigned2, 'Borough']

# One row is updated per collected index label
Num = len(Index_Notassigned2)
print("The number of rows that have been changed are:", Num)

The number of rows that have been changed are: 1


In [11]:
# Show the first rows followed by the (rows, columns) shape
print(Toronto.head(), Toronto.shape, sep='\n')

  PostalCode           Borough     Neighbourhood
3        M3A        North York         Parkwoods
4        M4A        North York  Victoria Village
5        M5A  Downtown Toronto      Harbourfront
6        M5A  Downtown Toronto       Regent Park
7        M6A        North York  Lawrence Heights
(211, 3)


### Condition number 2

In [12]:
# Combine the neighbourhoods that share a postal code and borough.
# Note: df is another name for the same DataFrame object as Toronto, not a copy.
df = Toronto
table_columns = ['PostalCode', 'Borough', 'Neighbourhood']

# Give every row the comma-joined neighbourhoods of its (PostalCode, Borough) group
neighbourhoods_by_group = df[table_columns].groupby(['PostalCode', 'Borough'])['Neighbourhood']
df['Neighbourhood'] = neighbourhoods_by_group.transform(','.join)

# Rows of one group are now identical, so dropping duplicates keeps the first
# row of each group together with its original index label
Toronto = df[table_columns].drop_duplicates()

## Resulting table with filtered data

In [13]:
# Preview the first rows of the filtered table
Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Harbourfront,Regent Park"
7,M6A,North York,"Lawrence Heights,Lawrence Manor"
9,M7A,Queen's Park,Queen's Park


## Shape of the table
After combining the neighbourhoods, each row corresponds to one postal code, so the number of rows in the table equals the number of distinct postal codes.

In [14]:
# (rows, columns) of the filtered table
Toronto.shape

(103, 3)

# Item 2 of the submission - Geolocation.
As I could not get the geocoder function to return latitude/longitude data reliably, I will use the provided CSV file of geospatial coordinates instead.

In [15]:
import pandas as pd

# Read the CSV of per-postal-code latitude/longitude straight from its URL
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
Location_data = pd.read_csv(geospatial_csv_url)
Location_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# (rows, columns) of the coordinates table
Location_data.shape

(103, 3)

In [17]:
# Rename 'Postal Code' to 'PostalCode' so the key column has the same name
# as in the Toronto table
Location_data = Location_data.rename(columns={'Postal Code': 'PostalCode'})

In [19]:
# Attach latitude/longitude to each postal code. This is an inner join on
# 'PostalCode', so only codes present in both tables are kept.
Torontofull = Toronto.merge(Location_data, on='PostalCode', how='inner')
Torontofull.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [20]:
# (rows, columns) of the merged table
print(Torontofull.shape)

(103, 5)
