# Segmenting and Clustering Neighborhoods in Toronto

### Import libraries and scrape the table with BeautifulSoup

In [4]:
import bs4 as bs
import urllib.request
import pandas as pd

# Download the Wikipedia page listing the "M" postal codes.
# The context manager closes the HTTP response as soon as the body is read,
# rather than leaving it open until garbage collection.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    source = response.read()
soup = bs.BeautifulSoup(source, 'lxml')

# `find` returns the first table carrying the "wikitable" class, or None.
table = soup.find('table', attrs={'class': 'wikitable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable' found on the page")
table_rows = table.find_all('tr')

### Transform the table into a DataFrame

In [10]:
# Turn every <tr> into a list of stripped cell texts.
# Rows without any <td> cell (e.g. a header row made of <th> cells) produce an
# empty list and are skipped by content, not by assuming they come first.
rows = []
for table_row in table_rows:
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    if cells:
        rows.append(cells)
df = pd.DataFrame(rows, columns=["Postal code", "Borough", 'Neighborhood'])

### Filter out rows with a "Not assigned" Borough

In [11]:
# Keep only rows whose borough has been assigned. `.copy()` makes the result an
# independent DataFrame, so later column assignments on `df` do not trigger
# SettingWithCopyWarning.
df = df[df['Borough'] != "Not assigned"].copy()

In [12]:
# Preview the first five rows of the filtered table.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [20]:
# Rename the key column so it matches the name used for the later merge on
# 'PostalCode'. Reassigning (rather than inplace=True) keeps the cell free of
# hidden in-place mutation.
df = df.rename(columns={'Postal code': 'PostalCode'})

### Separate the neighborhoods with commas

In [13]:
# Vectorised string replacement: regex=False treats '/' as a literal character,
# and missing values pass through as NaN instead of raising inside a lambda.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)

### Check whether any empty or "Not assigned" values remain (in a couple of different ways)

In [14]:
# Count missing vs. present Borough values (a single False bucket means none are missing).
borough_missing = df['Borough'].isna()
borough_missing.value_counts()

False    103
Name: Borough, dtype: int64

In [15]:
# True when any Neighborhood entry is the 'Not assigned' placeholder or an empty string.
bool(df['Neighborhood'].isin(['Not assigned', '']).any())

False

### DataFrame shape

In [16]:
# (rows, columns) of the cleaned table.
df.shape

(103, 3)

### Get latitude and longitude based on postal code

In [31]:
import geocoder

# Attempt to geocode one sample postal code through the Google provider.
geocoder.google('M3A, Toronto, Ontario')

<[REQUEST_DENIED] Google - Geocode [empty]>

The geocoder request is denied (see `REQUEST_DENIED` above), so I will load the coordinates from the CSV table instead.

In [32]:
# NOTE(review): empty placeholder; `geoc` is overwritten by pd.read_csv in the next cell.
geoc=pd.DataFrame()

In [33]:
# Load the postal-code coordinates table from the hosted CSV.
geoc = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')

In [34]:
# Preview the first five rows of the coordinates table.
geoc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
# Align the key column name with the 'PostalCode' key used in the merge.
# Reassigning (rather than inplace=True) keeps the cell free of hidden
# in-place mutation.
geoc = geoc.rename(columns={'Postal Code': 'PostalCode'})

In [36]:
# Inner join on PostalCode: keeps only postal codes present in both tables,
# with the coordinate columns first.
df2 = geoc.merge(df, on='PostalCode', how='inner')

In [37]:
# Preview the first five rows of the merged table.
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern , Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [38]:
# (rows, columns) of the merged table.
df2.shape

(103, 5)