# Segmenting and Clustering Neighborhoods in Toronto

## import the needed Library and read the zip code url and table

In [165]:
## import the needed Library
import pandas as pd
import numpy as np
import urllib
from bs4 import BeautifulSoup

torontoWikiUrl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
torontoResponse = urllib.request.urlopen(torontoWikiUrl)
totontoData = torontoResponse.read().decode('utf-8')
zipCodeHtml= BeautifulSoup(totontoData, 'html.parser')
zipCodeTable = zipCodeHtml.find('table', class_='wikitable sortable')
zipCodeList=[]
for tr in zipCodeTable.find_all('tr'):
    trow = []
    for td in tr.find_all('td'):
        trow.append(td.text.strip('\n'))
    zipCodeList.append(trow)    

## The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [166]:
zipCodeDF = pd.DataFrame(zipCodeList, columns = ['PostalCode', 'Borough', 'Neighborhood'])
## remove the header html tag
zipCodeDF = zipCodeDF[1:]
zipCodeDF

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


## Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [167]:
zipCodeDF_borough.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,212,212,212
unique,103,11,210
top,M8Y,Etobicoke,Runnymede
freq,8,45,2


In [168]:
zipCodeDF_borough = zipCodeDF[zipCodeDF.Borough != 'Not assigned']  
zipCodeDF_borough

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


## ##If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [169]:
zipCodeDF_borough.is_copy=False 
zipCodeDF_borough['Neighborhood'] = np.where(zipCodeDF_borough['Neighborhood'].values == 'Not assigned', 
                                                    zipCodeDF_borough['Borough'].values,
                                                    zipCodeDF_borough['Neighborhood'].values
                                            )
zipCodeDF_borough

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


## More than one neighborhood can exist in one postal code area.Rows will be combined into one row with the neighborhoods separated with a comma

In [150]:
zipCodeDF_combined = zipCodeDF_borough.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
zipCodeDF_borough.append(zipCodeDF_combined.to_frame())
zipCodeDF_borough

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


## print out the final DataFrame : using.shape() print the number of rows of your dataframe

In [170]:
zipCodeDF_borough.shape[0]

212