# Segmenting and Clustering Neighborhoods in Toronto
+ Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

+ Download the HTML source of the Wikipedia page (https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) and parse it

In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse its markup.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed as if it were the real article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'lxml')

+ Get the list of postal codes of Toronto and put it into a dataframe 'df'. Only cells that have an assigned borough are processed; cells with a borough that is Not assigned are ignored. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [3]:
# Walk every table row on the page and keep the three-cell rows
# (postal code, borough, neighbourhood) whose borough is assigned.
table = []
for tr in soup.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) != 3 or cells[1] == 'Not assigned':
        continue
    code, borough, neighbourhood = cells
    # An unassigned neighbourhood falls back to the borough name.
    if neighbourhood == 'Not assigned':
        neighbourhood = borough
    table.append([code, borough, neighbourhood])

df = pd.DataFrame(table)
df.columns = ['PostalCode','Borough','Neighbourhood']
df.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


+ Sort df by 'PostalCode' and put the result into a new dataframe 'df1'.

In [4]:
# Order the rows by postal code so that rows sharing a code end up adjacent.
df1 = df.sort_values(by='PostalCode')
df1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern
23,M1C,Scarborough,Port Union
22,M1C,Scarborough,Rouge Hill
21,M1C,Scarborough,Highland Creek


+ If more than one neighborhood exists in one postal code area, those rows are combined into one row with the neighborhoods separated by a comma. The result is stored in a new dataframe 'df2'.

In [5]:
# Collapse df1 to one row per postal code.
# - Borough: taken from the first row of each postal-code group.
# - Neighbourhood: all names in the group joined with ', ', in df1's row order.
# sort=False keeps the groups in the order they first appear in df1, and
# as_index=False keeps 'PostalCode' as a regular column with a 0..n-1 index.
df2 = (
    df1.groupby('PostalCode', as_index=False, sort=False)
       .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
df2 = df2[['PostalCode','Borough','Neighbourhood']]
print(df2.head())
print("Dataframe has the shape of " + str(df2.shape))

  PostalCode      Borough                           Neighbourhood
0        M1B  Scarborough                          Rouge, Malvern
1        M1C  Scarborough  Port Union, Rouge Hill, Highland Creek
2        M1E  Scarborough       Guildwood, Morningside, West Hill
3        M1G  Scarborough                                  Woburn
4        M1H  Scarborough                               Cedarbrae
Dataframe has the shape of (103, 3)


* Read latitude and longitude information for postal codes and create a new dataframe 'Geo_df'

In [6]:
# Load the postal-code coordinate table (Postal Code, Latitude, Longitude) from the course CSV.
geo_csv_url = 'http://cocl.us/Geospatial_data'
Geo_df = pd.read_csv(geo_csv_url)
Geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


* Merge two dataframes: 'df2' and 'Geo_df' and create a new dataframe 'df_merge'

In [7]:
# Attach coordinates to each postal code; the key column is named differently
# in the two frames, so both names are given explicitly.
df_merge = pd.merge(
    df2,
    Geo_df,
    left_on='PostalCode',
    right_on='Postal Code',
)
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


* Filter Borough containing Toronto

In [8]:
# Keep only the rows whose borough name contains "Toronto".
# reset_index() renumbers the rows and keeps the old labels in an 'index' column.
is_toronto = df_merge['Borough'].str.contains("Toronto")
df_toronto = df_merge.loc[is_toronto].reset_index()
df_toronto.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West, Riverdale",M4K,43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",M4L,43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,M4M,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879


* Cluster the postal codes based on location and create a new column 'cluster'

In [9]:
from sklearn.cluster import KMeans

# Fit k-means (4 clusters, fixed seed) on the latitude/longitude pairs and
# store each postal code's cluster label in a new 'cluster' column.
locations = df_toronto[['Latitude','Longitude']].values
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(locations)
df_toronto['cluster'] = kmeans.labels_
df_toronto.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,cluster
0,37,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031,0
1,41,M4K,East Toronto,"The Danforth West, Riverdale",M4K,43.679557,-79.352188,0
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",M4L,43.668999,-79.315572,0
3,43,M4M,East Toronto,Studio District,M4M,43.659526,-79.340923,0
4,44,M4N,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879,2


* Compute the latitude and longitude of the center of the map

In [10]:
# Centre the map on the midpoint of the bounding box of the Toronto postal codes.
# Each midpoint averages that axis's max and min; averaging the longitude max
# with itself would put the centre on the easternmost point instead.
lat_mid = (df_toronto.Latitude.max() + df_toronto.Latitude.min()) / 2
lon_mid = (df_toronto.Longitude.max() + df_toronto.Longitude.min()) / 2
Map_center = [lat_mid, lon_mid]

* Draw the map and label the cluster with colors

In [15]:
import folium

# One marker colour per cluster label (labels index into this list).
color_list=['red','green','blue','orange']
m = folium.Map(location=Map_center, zoom_start=12)

# Add one marker per postal code, labelled "<code>: <neighbourhoods>" and
# coloured by its cluster.
for code, hood, lat, lon, cluster_id in zip(
    df_toronto['PostalCode'],
    df_toronto['Neighbourhood'],
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['cluster'],
):
    label = code + ': ' + hood
    marker_icon = folium.Icon(color=color_list[cluster_id])
    folium.Marker([lat, lon], popup=label, icon=marker_icon).add_to(m)
m

Thank you :)