# Toronto Neighborhoods

In [1]:
#  imports
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd


In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

table = soup.find_all('table')
df = pd.read_html(str(table))[0]
df.head()


Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [3]:
df.columns

Index(['Postal code', 'Borough', 'Neighborhood'], dtype='object')

In [4]:
# lets renamne the column ' Postal Code' as 'PostalCode' as needed by the assignment
df.rename(columns = {'Postal code':'PostalCode'}, inplace = True) 
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [5]:
# Find indexes of rows that have "Not assigned" in Borough column
indexes = df[(df['Borough'] == "Not assigned")].index

# Drop rows that have "Not assigned" in Borough column
df.drop(indexes,inplace=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 3.2+ KB


In [7]:
# Combine multiple rows into one row based on PostalCode and Borough,
# However this will not make any changes as Postal codes in the original data in the wiki page is already grouped and delimited by /.

df_grp=df.groupby(['PostalCode','Borough'])['Neighborhood'].agg(", ".join).reset_index()

df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# change the delimiter sign in 'Neighborhood' colomn from '/' to ',' 

df_grp['Neighborhood']=df_grp['Neighborhood'].str.replace('/',',')
df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Shape of the Dataframe

df_grp.shape

(103, 3)

## Part 2:

In [10]:
# Read Geo Data from the csv file

url_geoData = "http://cocl.us/Geospatial_data"

df_geo_data = pd.read_csv(url_geoData )

df_geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# lets renamne the column ' Postal Code' as 'PostalCode' as needed by the assignment
df_geo_data.rename(columns = {'Postal Code':'PostalCode'}, inplace = True) 
df_geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Lets merge the  2 dataframes ( df_grp, df_geo_data) to create one dataframe (df_main)

df_main = pd.merge(df_grp, df_geo_data, on="PostalCode", how='left')
df_main.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
df_main.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [14]:
path = r'F:\Coursera courses\IBM\9 - Applied Data Science Capstone\Week 3 - Neighborhood Segmentation and Clustering\2 - Assignment\Main_pd_df.csv'

df_main.to_csv(path,index=None,header=True)

In [15]:
df_rd = pd.read_csv(path)

In [16]:
df_rd.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
