## Segmenting Neighbourhoods in Toronto

#### Installing and Importing Libraries
#### BeautifulSoup4 and lxml for parsing the HTML data into a readable table
#### pandas for manipulating DataFrames

In [18]:
!pip install lxml
!pip install bs4
import lxml as lx
import pandas as pd
import requests
from bs4 import BeautifulSoup



In [19]:
# Getting the data: download the Wikipedia page listing the "M" postal codes.
# NOTE(review): this reads the live page, so the result depends on the page's
# current layout -- confirm the table still has three columns before relying on it.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
website_text = response.text

# The page is HTML, so use lxml's HTML parser; the strict 'xml' parser is meant
# for XML documents and can mis-handle ordinary HTML markup.
soup = BeautifulSoup(website_text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + WIKI_URL)

# Creating the dataframe: keep only rows with exactly three <td> cells
# (header rows use <th>, so they are skipped by this check).
records = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 3:
        # get_text() collects the whole cell text, so an empty cell yields ''
        # instead of crashing and nested markup does not truncate the value.
        records.append([cell.get_text().strip() for cell in cells])
df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])

# Removing the Not Assigned Boroughs (the original row index is kept here)
df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [20]:
# Renumber the rows from 0 after filtering left gaps in the index.
# Reassigning (rather than inplace=True) avoids mutating the frame behind the
# reader's back and keeps this cell safe to re-run.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [21]:
# Display only the rows whose postal code appears in the list given in the instructions
sample_codes = ['M5G','M2H','M4B','M1J','M4M','M1R','M9V','M9L','M5V','M1B','M5A','M4G']
is_sample = df['Postal Code'].isin(sample_codes)
df.loc[is_sample]

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M1B,Scarborough,"Malvern, Rouge"
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
23,M4G,East York,Leaside
24,M5G,Downtown Toronto,Central Bay Street
27,M2H,North York,Hillcrest Village
32,M1J,Scarborough,Scarborough Village
50,M9L,North York,Humber Summit
54,M4M,East Toronto,Studio District
71,M1R,Scarborough,"Wexford, Maryvale"


In [22]:
# (rows, columns) of the cleaned postal-code table
df.shape

(103, 3)

In [23]:
# Load the coordinates lookup (one latitude/longitude pair per postal code) from the local CSV
geo_csv_path = 'Geospatial_Coordinates.csv'
df_geo = pd.read_csv(geo_csv_path)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [24]:
# (rows, columns) of the coordinates table, for comparison with df.shape
df_geo.shape

(103, 3)

In [25]:
# Merging the two tables on their shared 'Postal Code' column.
# validate='one_to_one' makes pandas raise MergeError if either table contains
# a duplicated postal code, which would otherwise silently multiply rows.
df1 = df.merge(df_geo, on='Postal Code', how='inner', validate='one_to_one')

# An inner join drops postal codes that have no coordinates; fail loudly
# rather than lose rows without notice.
assert len(df1) == len(df), (
    f'{len(df) - len(df1)} postal code(s) have no match in the coordinates table'
)
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [26]:
# Display the merged rows (with coordinates) for the postal codes given in the instructions
sample_codes = ['M5G','M2H','M4B','M1J','M4M','M1R','M9V','M9L','M5V','M1B','M5A','M4G']
is_sample = df1['Postal Code'].isin(sample_codes)
df1.loc[is_sample]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
23,M4G,East York,Leaside,43.70906,-79.363452
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
27,M2H,North York,Hillcrest Village,43.803762,-79.363452
32,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
50,M9L,North York,Humber Summit,43.756303,-79.565963
54,M4M,East Toronto,Studio District,43.659526,-79.340923
71,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849


In [27]:
# Summarise the merged table: distinct boroughs, distinct neighborhoods, overall dimensions.
# dropna=False keeps the count equal to len(Series.unique()), which includes NaN as a value.
borough_count = df1['Borough'].nunique(dropna=False)
neighborhood_count = df1['Neighborhood'].nunique(dropna=False)
merged_shape = df1.shape

print('Number of Unique Boroughs', borough_count)
print('Number of Unique Neighborhoods', neighborhood_count)
print('Shape of Merged DataFrame', merged_shape)

Number of Unique Boroughs 10
Number of Unique Neighborhoods 99
Shape of Merged DataFrame (103, 5)
