### _Segmenting and Clustering Neighborhoods in Toronto Part 1 - Web Scraping using BeautifulSoup_ ###

#### Import required libraries ####

In [1]:
#Import required libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print('Libraries Imported Successfully!')

Libraries Imported Successfully!


#### Download url and construct a dataframe ####

In [2]:
#Download web page
url  = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url)

# Construct a dataframe
df = pd.read_html(url, header=0)[0]

print(df.head(10))

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
5      M5A  Downtown Toronto       Regent Park
6      M6A        North York  Lawrence Heights
7      M6A        North York    Lawrence Manor
8      M7A      Queen's Park      Not assigned
9      M8A      Not assigned      Not assigned


#### Remove Boroughs that are not assigned ####

In [3]:
# Remove  Bouroughs that are 'not assigned'
df = df[df.Borough != 'Not assigned']
print(df.head(10))
df.shape

   Postcode           Borough     Neighbourhood
2       M3A        North York         Parkwoods
3       M4A        North York  Victoria Village
4       M5A  Downtown Toronto      Harbourfront
5       M5A  Downtown Toronto       Regent Park
6       M6A        North York  Lawrence Heights
7       M6A        North York    Lawrence Manor
8       M7A      Queen's Park      Not assigned
10      M9A         Etobicoke  Islington Avenue
11      M1B       Scarborough             Rouge
12      M1B       Scarborough           Malvern


(212, 3)

#### 'Not assigned' Neighborhoods replaced with Borough value ####

In [4]:
#Replace 'not assigned' with a Borough value
for index, row in df.iterrows():
    if df.loc[index,'Neighbourhood'] == 'Not assigned':
        df.loc[index,'Neighbourhood']=df.loc[index,'Borough']
print(df.head(10))

   Postcode           Borough     Neighbourhood
2       M3A        North York         Parkwoods
3       M4A        North York  Victoria Village
4       M5A  Downtown Toronto      Harbourfront
5       M5A  Downtown Toronto       Regent Park
6       M6A        North York  Lawrence Heights
7       M6A        North York    Lawrence Manor
8       M7A      Queen's Park      Queen's Park
10      M9A         Etobicoke  Islington Avenue
11      M1B       Scarborough             Rouge
12      M1B       Scarborough           Malvern


#### Group Neighborhoods via Postcode ####

In [5]:
df=df.groupby('Postcode', as_index=False).agg(lambda x: ', '.join(set(x.dropna())))
print(df.head(10))

  Postcode      Borough                                    Neighbourhood
0      M1B  Scarborough                                   Malvern, Rouge
1      M1C  Scarborough           Highland Creek, Rouge Hill, Port Union
2      M1E  Scarborough                Morningside, Guildwood, West Hill
3      M1G  Scarborough                                           Woburn
4      M1H  Scarborough                                        Cedarbrae
5      M1J  Scarborough                              Scarborough Village
6      M1K  Scarborough      Ionview, Kennedy Park, East Birchmount Park
7      M1L  Scarborough                  Golden Mile, Clairlea, Oakridge
8      M1M  Scarborough  Cliffside, Scarborough Village West, Cliffcrest
9      M1N  Scarborough                      Cliffside West, Birch Cliff


#### Change 'Neighbourhood' to 'Neighborhood' ####

In [6]:
df = df.rename(columns={df.columns[2]: "Neighborhood" })
print(df.head())

  Postcode      Borough                            Neighborhood
0      M1B  Scarborough                          Malvern, Rouge
1      M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2      M1E  Scarborough       Morningside, Guildwood, West Hill
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae


#### Shape of Dataframe ####

In [7]:
print('The number of Postal codes = {}.'.format(df.shape[0]))

The number of Postal codes = 103.
