 # **Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto**
 <br/><br/>
 *Author-Student: David Gerard*
 _____
 _____

 We first import the libraries we need
 _____

In [27]:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np



 We can obtain the source file using the 'get' command from the requests library.
 I first create a variable containing the link to the webpage.
 Then I create another variable to contain the response of the 'get' command.
 I pass only the text of the webpage into the 'WebText' variable
 and finally I use 'BeautifulSoup()' to parse the HTML text and pass it to 'htmlContent'
 _____

In [28]:
# Wikipedia page that lists the postal codes starting with 'M'.
WebLink = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps this cell from hanging forever if the server never answers.
WebPage = requests.get(WebLink, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page as if it were the data.
WebPage.raise_for_status()
WebText = WebPage.text
# Parse the HTML text with the 'lxml' parser.
htmlContent = BeautifulSoup(WebText, 'lxml')



 From the 'htmlContent' we extract the table containing the data we want
 and we pass it to the variable dataTable.
 <br/>
 We identify the table we want using the 'class' attribute 'wikitable sortable'
 _____

In [29]:
# Take the first <table> whose class attribute is 'wikitable sortable'.
dataTable = htmlContent.find('table', attrs={"class":"wikitable sortable"})
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in the cell that reads the rows.
if dataTable is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the page")



 Now that we have easy access to our web table we can iterate through each row
 identified by the HTML tags 'tr' and extract the text from each cell.
 _____

In [30]:
# Every <tr> element of the table is one row; the first one (index 0) holds the headers.
tableRows = dataTable.find_all('tr')

# Column names: the text of each <th> cell of the first row,
# with line breaks removed and surrounding whitespace stripped.
TableHeaders = [
    headerCell.text.replace('\n', '').strip()
    for headerCell in tableRows[0].find_all('th')
]

# One dictionary per <tr>: header -> cleaned text of the matching <td> cell.
# The header row has no <td> cells, so its entry is an empty dictionary.
theTable = [
    {
        columnName: dataCell.text.replace('\n', '').strip()
        for dataCell, columnName in zip(tableRow.find_all('td'), TableHeaders)
    }
    for tableRow in tableRows
]

print(theTable[:5]) # print the first 5 row dictionaries of the list


[{}, {'Postcode': 'M1A', 'Borough': 'Not assigned', 'Neighbourhood': 'Not assigned'}, {'Postcode': 'M2A', 'Borough': 'Not assigned', 'Neighbourhood': 'Not assigned'}, {'Postcode': 'M3A', 'Borough': 'North York', 'Neighbourhood': 'Parkwoods'}, {'Postcode': 'M4A', 'Borough': 'North York', 'Neighbourhood': 'Victoria Village'}]


 Let's now pass the list of dictionaries 'theTable' into a pandas dataframe
 _____

In [31]:
# pandas is already imported as 'pd' in the import cell at the top of the notebook,
# so it is not imported again here.
# Each dictionary of 'theTable' becomes one row; its keys become the columns.
df = pd.DataFrame(theTable)
df.head()


Unnamed: 0,Borough,Neighbourhood,Postcode
0,,,
1,Not assigned,Not assigned,M1A
2,Not assigned,Not assigned,M2A
3,North York,Parkwoods,M3A
4,North York,Victoria Village,M4A


 We can clean our dataframe
 <br/><br/>
 Let's apply the following:
  - delete empty row (drop na)
  - remove rows with 'Not assigned' value as Borough
  - replace 'Not assigned' value in Neighbourhood column with value from Borough column
  - group by columns Postcode and Borough, and join values from Neighbourhood
 <br/><br/>
 Then we can finish by using the .shape attribute to print the number of rows and columns in the dataframe

In [32]:
# Clean the frame in one non-mutating chain. The previous version assigned a
# column on a boolean-filtered slice, which raises pandas' SettingWithCopyWarning,
# and used inplace=True; here every step returns a new frame.
df = (
    df.dropna(axis=0)                                        # drop rows that contain missing values
      .loc[lambda frame: frame['Borough'] != 'Not assigned']  # keep only rows with a real borough
      .assign(
          # where the neighbourhood is 'Not assigned', fall back to the borough name
          Neighbourhood=lambda frame: np.where(
              frame['Neighbourhood'] == 'Not assigned',
              frame['Borough'],
              frame['Neighbourhood'],
          )
      )
)

# One row per (Postcode, Borough); the neighbourhood names of the group are joined with commas.
dfResult = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()
print(dfResult.head(n=10))
print(dfResult.shape)  # (number of rows, number of columns)

Postcode      Borough                                  Neighbourhood
0      M1B  Scarborough                                  Rouge,Malvern
1      M1C  Scarborough           Highland Creek,Rouge Hill,Port Union
2      M1E  Scarborough                Guildwood,Morningside,West Hill
3      M1G  Scarborough                                         Woburn
4      M1H  Scarborough                                      Cedarbrae
5      M1J  Scarborough                            Scarborough Village
6      M1K  Scarborough      East Birchmount Park,Ionview,Kennedy Park
7      M1L  Scarborough                  Clairlea,Golden Mile,Oakridge
8      M1M  Scarborough  Cliffcrest,Cliffside,Scarborough Village West
9      M1N  Scarborough                     Birch Cliff,Cliffside West
(103, 3)



 Now we are going to retrieve the coordinates of the different neighbourhoods.
 For that, we will use the provided CSV file, because importing the geocoder library failed

In [33]:

# CSV file with the coordinates; the path is relative to the working directory.
fileName = 'Geospatial_coordinates.csv'
# Load the whole file into a dataframe.
coordinatesFile = pd.read_csv(filepath_or_buffer=fileName)
# Preview the first five rows.
coordinatesFile.head(n=5)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# Give the coordinates table the same key column name as dfResult.
# rename() returns a new frame, so the name is rebound instead of mutating the
# frame in place; re-running this cell gives the same result.
coordinatesFile = coordinatesFile.rename(columns={'Postal Code':'Postcode'})
# Inner join on 'Postcode' (pandas' default, spelled out here): only postcodes
# present in both frames are kept.
dfWithCoordinates = dfResult.merge(coordinatesFile, on='Postcode', how='inner')
dfWithCoordinates.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
