# Segmenting and Clustering Neighborhoods in Toronto

In [11]:
#installing requirements
!pip install lxml
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
Collecting soupsieve>1.2; python_version >= "3.0"
  Downloading soupsieve-2.0.1-py3-none-any.whl (32 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1277 sha256=102f9f42fe162bc38f11b7080a37bbfa8140b1ecbecd32d75e2e2f41f794b394
  Stored in directory: c:\users\tummi\appdata\local\pip\cache\wheels\0a\9e\ba\20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.3 bs4-0.0.1 soupsieve-2.0.1


### Getting data from Wikipedia using BeautifulSoup


In [113]:
#importing libraries
from pathlib import Path

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [118]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page; bound the wait and fail loudly on an HTTP error instead of
# silently parsing an error page.
source = requests.get(url, timeout=30)
source.raise_for_status()

# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and to avoid BeautifulSoup's parser-guessing warning).
soup = BeautifulSoup(source.content, 'lxml')

table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + url)
rows = table.find_all('tr')

# One list of non-empty <td> texts per <tr>. A row without any <td> cells
# (presumably the <th> header row) yields an empty list and therefore an
# all-missing row in the frame; it is removed during data preparation.
data = []
for row in rows:
    cell_texts = [cell.text.strip() for cell in row.find_all('td')]
    data.append([text for text in cell_texts if text])

df = pd.DataFrame(data)
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

df.shape

(181, 3)

### Data preparation

In [119]:
# Clean the scraped table with non-mutating, label-independent steps so the
# cell gives the same result no matter how many times it is run (dropping
# "index 0" in place and then resetting the index would delete a real row on
# every re-run).
df = (
    df
    .dropna(how='all')                                        # the empty row produced by the table header
    .loc[lambda frame: frame['Borough'] != 'Not assigned']    # postal codes with no borough
    .reset_index(drop=True)
)
# No groupby is needed here: a bare `df.groupby('PostalCode')` has no effect
# unless its result is used, and postal-code uniqueness is checked later by the
# merge's validate='one_to_one'.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Shape of the DataFrame

In [120]:
# (rows, columns) of the prepared postal-code frame.
df.shape

(103, 3)

### Merging with latitude and longitude for each postal code

In [139]:
# Prefer a copy of the coordinates file in the current working directory so the
# notebook runs on any machine; fall back to the original author's absolute
# location when no local copy is present.
GEO_CSV = Path('Geospatial_Coordinates.csv')
if not GEO_CSV.exists():
    GEO_CSV = Path('c:/Users/tummi/Desktop/Corsi e learning/Attuali/IBM DS professional certificate (in corso)/9. Applied Capstone Project/Geospatial_Coordinates.csv')

df2 = pd.read_csv(GEO_CSV)
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [147]:
# Join the boroughs to their coordinates on the postal code. Passing column
# names (rather than Series) as keys avoids the synthetic `key_0` column, so
# only the duplicate right-hand key column has to be dropped afterwards.
# validate='one_to_one' makes the merge raise if a postal code is duplicated
# on either side.
merged_df = (
    pd.merge(
        df,
        df2,
        how='inner',
        left_on='PostalCode',
        right_on='Postal Code',
        validate='one_to_one',
    )
    .drop(columns='Postal Code')
)

merged_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [158]:
#quick example check
# The mask must come from merged_df itself: a mask built from `df` is aligned
# by row label, and the inner merge can leave merged_df with different rows
# under the same labels, which would select the wrong record.
merged_df.loc[merged_df['PostalCode'] == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


### Toronto Borough Clustering

Creating a dataframe with the mean latitude and longitude of each borough

In [162]:
# Average the coordinates per borough. The numeric columns are selected
# explicitly because taking the mean of the text columns (PostalCode,
# Neighborhood) raises a TypeError on pandas >= 2.0 instead of being skipped.
grouped_df = (
    merged_df
    .groupby('Borough')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)

grouped_df

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Scarborough,43.766229,-79.249085
8,West Toronto,43.652653,-79.44929
9,York,43.690797,-79.472633


Running the k-means clustering algorithm to divide the boroughs into 3 clusters

In [176]:
from sklearn.cluster import KMeans

# Cluster on the coordinates only. Selecting the two feature columns by name
# (instead of dropping 'Borough') keeps a 'Cluster' column left over from an
# earlier run of the notebook out of the features, and avoids the positional
# `axis` argument of DataFrame.drop that pandas 2.0 removed.
grouped_df_clustering = grouped_df[['Latitude', 'Longitude']]

k = 3

# n_init is pinned so the result does not change with the scikit-learn
# version (the default moved from 10 to 'auto'); random_state fixes the seed.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(grouped_df_clustering)

kmeans.labels_

array([1, 0, 0, 0, 2, 2, 1, 1, 1, 1])

In [177]:
# Attach each borough's k-means label as a 'Cluster' column and show the result.
grouped_df = grouped_df.assign(Cluster=kmeans.labels_)
grouped_df

Unnamed: 0,Borough,Latitude,Longitude,Cluster
0,Central Toronto,43.70198,-79.398954,1
1,Downtown Toronto,43.654597,-79.383972,0
2,East Toronto,43.669436,-79.324654,0
3,East York,43.700303,-79.335851,0
4,Etobicoke,43.660043,-79.542074,2
5,Mississauga,43.636966,-79.615819,2
6,North York,43.750727,-79.429338,1
7,Scarborough,43.766229,-79.249085,1
8,West Toronto,43.652653,-79.44929,1
9,York,43.690797,-79.472633,1


### Visualizing the clusters

In [185]:
#importing libraries
import geopy
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

# Locate Toronto to centre the map on it.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

Creating the map to visualize the clusters

In [186]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per borough, coloured by its cluster.
for lat, lon, poi, cluster in zip(grouped_df['Latitude'], grouped_df['Longitude'], grouped_df['Borough'], grouped_df['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so they index the palette directly;
    # `cluster - 1` would make cluster 0 wrap around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters