# Segmenting and Clustering Neighborhoods in Toronto
Explore, segment, and cluster the neighborhoods in the city of Toronto

In [None]:
# Install geopy from the conda-forge channel; `--yes` auto-confirms the prompt.
# geopy provides the Nominatim geocoder imported later in this notebook.
!conda install -c conda-forge geopy --yes

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: | 

In [12]:
import requests
import bs4
import pandas as pd
import numpy as np
import geocoder

ModuleNotFoundError: No module named 'geocoder'

## Scraping the table with the data

We use the BeautifulSoup library to scrape the Wikipedia page.

In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')

Get the table with all the data about postal codes and then all the rows composing this table.

In [3]:
# Grab the first <table> element in the parsed page.
table = soup.find('table')

In [4]:
# Every <tr> inside the table; the first one is later used as the header row.
# `find_all` is the supported spelling (`findAll` is a deprecated alias), and a
# plain tag name replaces the lambda that only compared `tag.name`.
rows = table.find_all('tr')

Names of the columns

In [5]:
# Header cells (<th>) of the first table row hold the column names.
# `find_all` replaces the deprecated `findAll` alias.
columns = rows[0].find_all('th')
# Strip the newlines surrounding each header text.
name_columns = [column.get_text().strip('\n') for column in columns]

Get all the data and put it in a dataframe.

In [6]:
# Collect the text of every <td> cell, one list per data row.
# Rows are gathered in a plain list and converted once, instead of calling
# np.vstack inside the loop (which re-copies the whole array on every row).
records = []
for row in rows[1:]:
    cells = [cell.get_text().strip('\n') for cell in row.find_all('td')]
    # A row without any <td> cell would break the array stacking; skip it.
    if cells:
        records.append(cells)

# Same layout as before: the header is row 0, followed by the data rows.
postal_codes = np.array([name_columns] + records)

df_postal_codes = pd.DataFrame(postal_codes[1:], columns=postal_codes[0])
df_postal_codes

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Clean the dataframe

Keep only the rows that have an assigned borough:

In [7]:
# Drop every row whose Borough contains "Not assigned", then renumber the index.
df_postal_codes = (
    df_postal_codes
    .loc[~df_postal_codes["Borough"].str.contains("Not assigned")]
    .reset_index(drop=True)
)

All postal codes are unique, so we don't have to combine them:

In [8]:
# Count how often each postal code occurs, sorted by count in ascending order;
# if the largest count is 1, every postal code appears exactly once.
df_postal_codes["Postal Code"].value_counts().sort_values()

M5E    1
M2H    1
M1V    1
M6P    1
M3N    1
      ..
M5P    1
M1K    1
M4H    1
M4G    1
M5V    1
Name: Postal Code, Length: 103, dtype: int64

No row has an assigned borough but a "Not assigned" neighbourhood:

In [9]:
# Select the rows whose Neighbourhood contains "Not assigned"; an empty result
# means no remaining row lacks a neighbourhood.
df_postal_codes[df_postal_codes.Neighbourhood.str.contains("Not assigned")]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [10]:
# Display the cleaned frame (rows with an unassigned borough were removed above).
df_postal_codes

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [11]:
# (number of rows, number of columns) of the cleaned frame.
df_postal_codes.shape

(103, 3)

## Coordinates

In [None]:
from geopy.geocoders import Nominatim

In [17]:
# RateLimiter ships with geopy; imported here so this cell works on its own.
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="toronto_explorer")
# The public Nominatim service asks for at most one request per second, so the
# lookups are throttled instead of being fired back-to-back.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

longitude, latitude = [], []

for postal_code in df_postal_codes['Postal Code']:
    location = geocode('{}, Toronto, Ontario'.format(postal_code))
    # A lookup that finds nothing yields None; record a missing value rather
    # than crashing on `None.longitude`.
    longitude.append(location.longitude if location is not None else None)
    latitude.append(location.latitude if location is not None else None)

# One coordinate pair per postal code, in the same order as the frame's rows.
df_postal_codes['Latitude'] = latitude
df_postal_codes['Longitude'] = longitude
df_postal_codes

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


In [None]:
 # import geocoder

# Look up the coordinates of `postal_code` with the geocoder package.
# NOTE(review): `postal_code` is whatever value the previous geocoding loop
# left behind, and `geocoder` must have been imported successfully - confirm
# both before relying on this cell.
MAX_ATTEMPTS = 5  # upper bound so a permanently failing service cannot hang the cell

lat_lng_coords = None

for attempt in range(MAX_ATTEMPTS):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    # Treat both None and an empty result as "no coordinates yet".
    if lat_lng_coords:
        break
else:
    raise RuntimeError(
        'No coordinates returned for {} after {} attempts'.format(
            postal_code, MAX_ATTEMPTS
        )
    )

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]