# Exploring and clustering the neighborhoods in Toronto

In [1]:
# Import the libraries used for scraping (requests, BeautifulSoup) and tabulating (pandas)
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the Wikipedia page of Canadian postal codes starting with "M"
# and parse it into a BeautifulSoup object.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
# Convert the HTML text of the response into a navigable parse tree.
data = response.text
soup = BeautifulSoup(data, 'html.parser')

## Develop the dataframe

In [3]:
# Scrape the first table on the page into a list of records, one per <td> cell.
# Each usable cell is expected to hold a <p> whose text starts with the
# three-character postal code and a <span> of the form "Borough(Neighborhoods)".
table_contents = []
table = soup.find('table')
if table is None:
    # Without this guard the loop below would fail with an opaque AttributeError.
    raise ValueError('No <table> element found on the page; its layout may have changed.')

for row in table.find_all('td'):
    # Skip cells that lack the <p>/<span> pair the parsing below relies on.
    if row.p is None or row.span is None:
        continue

    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    # Text before the first "(" is the borough; text between the first and
    # second "(" lists the neighborhoods.
    parts = span_text.split('(')
    borough = parts[0]
    if len(parts) > 1:
        # Drop the closing parenthesis and turn " /" separators into commas.
        neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighborhood list: fall back to the borough name
        # rather than raising IndexError. NOTE(review): confirm this fallback
        # is the desired convention for such cells.
        neighborhood = borough.strip()

    table_contents.append({
        'PostalCode': row.p.text[:3],
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

# Pin the column names so the frame has the expected columns even if no rows were scraped.
df = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


## Clean up the dataframe

In [4]:
# Confirm that no 'Not assigned' placeholder remains in the Neighborhood or
# Borough column; each print should show an empty DataFrame.
for column in ('Neighborhood', 'Borough'):
    unassigned = df[column] == 'Not assigned'
    print(df.loc[unassigned])

Empty DataFrame
Columns: [PostalCode, Borough, Neighborhood]
Index: []
Empty DataFrame
Columns: [PostalCode, Borough, Neighborhood]
Index: []


In [5]:
# List the distinct borough labels so that malformed ones can be spotted
df.loc[:, 'Borough'].unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'East YorkEast Toronto', 'Central Toronto',
       'MississaugaCanada Post Gateway Processing Centre',
       'Downtown TorontoStn A PO Boxes25 The Esplanade',
       'EtobicokeNorthwest',
       'East TorontoBusiness reply mail Processing Centre969 Eastern'],
      dtype=object)

In [6]:
# Replace the run-together borough labels with shorter, readable names.
# Labels that are not keys of the mapping are left unchanged.
df['Borough'] = df['Borough'].replace(
    to_replace={
        'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
        'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
        'EtobicokeNorthwest': 'Etobicoke Northwest',
        'East YorkEast Toronto': 'East York/East Toronto',
        'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
    }
)

In [7]:
# Check that no postal code is assigned to more than one row:
# spot-check the rows for 'M5A', then count the occurrences of every code.
is_m5a = df['PostalCode'] == 'M5A'
print(df[is_m5a])
df.loc[:, 'PostalCode'].value_counts()

  PostalCode           Borough               Neighborhood
2        M5A  Downtown Toronto  Regent Park, Harbourfront


M5C    1
M5L    1
M2P    1
M2H    1
M4J    1
      ..
M1W    1
M5J    1
M3C    1
M4W    1
M1X    1
Name: PostalCode, Length: 103, dtype: int64

In [8]:
# Dimensions of the dataframe as a (rows, columns) tuple
df.shape

(103, 3)

## Saving the dataframe

In [9]:
# Save the dataframe as 'Capstone_part1.csv'.
# index=False keeps the auto-generated row index out of the file; otherwise it
# is written as an unnamed first column and shows up as "Unnamed: 0" when the
# CSV is read back with pd.read_csv.
# NOTE(review): if a later notebook reads this file with index_col=0 or drops
# "Unnamed: 0", update that reader accordingly.
df.to_csv('Capstone_part1.csv', index=False)
print('Successfully Saved!')

Successfully Saved!
