# Segmenting and Clustering Neighborhoods in Toronto
### Javier Cholbi Doblado

In [1]:
import os
from urllib.request import urlopen, Request

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

In [58]:
# Download the Wikipedia page that lists the "M" postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
request = Request(url)
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urlopen(request) as response:
    html = response.read()

# read_html returns a list of dataframes, one per table found in the page;
# the first one is kept, with its first row used as the column header.
df = pd.read_html(html, header=0)[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


* Get the data in html format from wikipedia URL
* Put the html dataset into a dataframe

In [59]:
# Keep only the rows whose Borough has actually been assigned.
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]

In [60]:
# A row with a Borough but no Neighbourhood takes the Borough's own name as
# its Neighbourhood. Filling from the row's Borough (rather than a hard-coded
# "Queen's Park" applied to every column) stays correct if other boroughs
# ever have an unassigned neighbourhood.
# assign() returns a new frame, so the filtered slice is not modified in place.
df = df.assign(
    Neighbourhood=df['Neighbourhood'].where(
        df['Neighbourhood'] != 'Not assigned', df['Borough']
    )
)

* Drop the rows whose Borough is 'Not assigned'
* Fix the Queen's Park row: its Neighbourhood is 'Not assigned', so it is given the Borough name instead

In [61]:
# Preview the first rows of the filtered dataframe.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [62]:
# (rows, columns) of the filtered dataframe.
df.shape

(211, 3)

In [63]:
# Renumber the rows from 0 (the earlier filtering left gaps in the index) and
# rename 'Postcode' to 'Postal Code', the column name the later merge joins on.
# Done with vectorized pandas calls instead of copying every cell in a loop.
df = df.rename(columns={'Postcode': 'Postal Code'}).reset_index(drop=True)

* Rebuild the dataframe so that its rows are indexed from 0 to 210 and its first column is named 'Postal Code'

In [64]:
# Collapse rows that share a postal code and borough into a single row,
# joining the remaining column's values with commas. sort=False keeps the
# groups in first-seen order; as_index=False keeps the keys as columns.
group_keys = ['Postal Code', 'Borough']
grouped = df.groupby(group_keys, as_index=False, sort=False)
df = grouped.agg(','.join)

* Group by Postal Code and Borough, joining the neighbourhood names of each group with commas

In [73]:
# Preview the first ten rows after grouping.
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [66]:
# (rows, columns) after grouping.
df.shape

(103, 3)

In [67]:
# Directory holding the coordinates CSV. The default is the original
# machine-specific location; set the GEO_DATA_DIR environment variable to run
# the notebook elsewhere without editing this cell.
path = os.environ.get('GEO_DATA_DIR', 'C:/Users/00022881/Downloads/')
name = 'Geospatial_Coordinates.csv'
# os.path.join copes with the directory being given with or without a
# trailing separator.
cor = pd.read_csv(os.path.join(path, name))
cor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


* Load the CSV file with the latitude and longitude of each postal code

In [68]:
# Attach coordinates to each postal code. The inner join keeps only postal
# codes that appear in both dataframes.
df_2 = df.merge(cor, how='inner', on='Postal Code')

* Merge both datasets with an inner join on the 'Postal Code' column

In [72]:
# Preview the first ten rows of the merged dataframe.
df_2.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [71]:
# (rows, columns) of the merged dataframe.
df_2.shape

(103, 5)