# Segmenting and Clustering Neighborhoods in Toronto

## Gathering Toronto Neighborhood Data

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
site_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
def make_soup(link, timeout=30):
    """Download *link* and parse the response body into a BeautifulSoup tree.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

In [4]:
# Fetch the page at site_link and parse it.
soup = make_soup(site_link)

In [5]:
# Pick the first <table> whose class attribute is 'wikitable sortable'.
# NOTE(review): find() presumably yields None when no such table exists
# (e.g. after a page layout change) - confirm and guard if needed.
location_table_data = soup.find('table',{'class':'wikitable sortable'})

In [6]:
# Look up the <td> cells once and keep only their stripped text; repeating
# find_all('td') for every cell would rescan the whole table each time.
cells = [cell.text.strip() for cell in location_table_data.find_all('td')]

# Consecutive cells are treated as (postal code, borough, neighborhood)
# triples, so every third cell starting at offset 0, 1 and 2 forms a column.
postcode = cells[0::3]
borough = cells[1::3]
neighborhood = cells[2::3]

In [7]:
# Assemble the three scraped columns into a single table.
neighborhood_data = pd.DataFrame(
    {
        'Postal Code': postcode,
        'Borough': borough,
        'Neighborhood': neighborhood,
    }
)

## Data Pre-processing

In [8]:
# Discard rows whose borough is the 'Not assigned' placeholder,
# then order the remaining rows by postal code.
has_borough = neighborhood_data['Borough'] != 'Not assigned'
neighborhood_data = neighborhood_data[has_borough]
sorted_neighborhood = neighborhood_data.sort_values(by='Postal Code')

In [9]:
# Preview the first rows of the table sorted by postal code.
sorted_neighborhood.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
28,M1C,Scarborough,Port Union
27,M1C,Scarborough,Rouge Hill
26,M1C,Scarborough,Highland Creek


In [10]:
# Keep only the first two columns and drop repeated rows, leaving one row
# per distinct pair of values in those columns.
no_duplicate=sorted_neighborhood.iloc[:,:2].drop_duplicates()

In [11]:
# Map each postal code to the list of neighborhood names recorded for it.
neighborhood_list_data = (
    sorted_neighborhood
    .groupby('Postal Code')['Neighborhood']
    .apply(list)
    .to_dict()
)

In [12]:
# Attach each row's neighborhood list by looking it up under the row's own
# postal code. Matching on the key keeps rows and lists paired even if the
# row order and the dict order ever differ, and avoids a length-mismatch
# error when the two do not have the same number of entries.
no_duplicate['Neighborhood'] = no_duplicate['Postal Code'].map(neighborhood_list_data)

In [13]:
# Discard the index labels left over from filtering and sorting and
# renumber the rows from 0.
no_duplicate=no_duplicate.reset_index(drop=True)

In [14]:
# Check whether any row's 'Neighborhood' list still holds the
# 'Not assigned' placeholder, and report the position of each such row.
for index, i in enumerate(no_duplicate['Neighborhood']):
    if 'Not assigned' not in i:
        continue
    print(index, 'need to data preprocessing')

In [15]:
# Save the cleaned table to Neighborhood_data.csv without the index column.
# NOTE(review): the 'Neighborhood' column holds Python lists, which are
# presumably written as their string representation - confirm that readers
# of this file expect that format.
no_duplicate.to_csv('Neighborhood_data.csv',index=False)

In [16]:
# Show the first ten rows of the cleaned table.
no_duplicate.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Port Union, Rouge Hill, Highland Creek]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Golden Mile, Oakridge, Clairlea]"
8,M1M,Scarborough,"[Cliffcrest, Scarborough Village West, Cliffside]"
9,M1N,Scarborough,"[Cliffside West, Birch Cliff]"


In [17]:
# (number of rows, number of columns) of the cleaned table.
no_duplicate.shape

(103, 3)