# Segmenting and Clustering Neighborhoods in Toronto

##### Install the necessary modules

In [2]:
# we need an extra module to read the wiki page
!pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 6.9MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0


##### Read the Wiki Page of Postal Codes

In [3]:
# import the necessary libraries
import pandas as pd
import lxml
# read the Wikipedia page
l_toronto = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# lets see what we got: - its a list of dataframes
print(l_toronto)

[    Postal code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
..          ...               ...   
175         M5Z      Not assigned   
176         M6Z      Not assigned   
177         M7Z      Not assigned   
178         M8Z         Etobicoke   
179         M9Z      Not assigned   

                                          Neighborhood  
0                                                  NaN  
1                                                  NaN  
2                                            Parkwoods  
3                                     Victoria Village  
4                           Regent Park / Harbourfront  
..                                                 ...  
175                                                NaN  
176                                                NaN  
177                                      

In [4]:
# Convert the list into one dataframe for Toronto Neiborhoods (TN)
df_TN = pd.concat(l_toronto)
# lets see what we got:
df_TN.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,Borough,Neighborhood,Postal code
0,,,,,,,,,,,...,,,,,,,,Not assigned,,M1A
1,,,,,,,,,,,...,,,,,,,,Not assigned,,M2A
2,,,,,,,,,,,...,,,,,,,,North York,Parkwoods,M3A
3,,,,,,,,,,,...,,,,,,,,North York,Victoria Village,M4A
4,,,,,,,,,,,...,,,,,,,,Downtown Toronto,Regent Park / Harbourfront,M5A


## Lets clean up the data we retrieved

##### Remove unnecessary columns

In [5]:
# Way to much colums, lets lose the ones we dont need.
df_TN_1 = df_TN.drop(columns=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
# lets see what we got:
df_TN_1.head(10)

Unnamed: 0,Borough,Neighborhood,Postal code
0,Not assigned,,M1A
1,Not assigned,,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Regent Park / Harbourfront,M5A
5,North York,Lawrence Manor / Lawrence Heights,M6A
6,Downtown Toronto,Queen's Park / Ontario Provincial Government,M7A
7,Not assigned,,M8A
8,Etobicoke,Islington Avenue,M9A
9,Scarborough,Malvern / Rouge,M1B


In [6]:
# lets find out some more 
df_TN_1.shape

(186, 3)

##### Remove surplus rows we don't need

In [8]:
# on the wiki page there are 180 rows, lets find the extra rows
df_TN_1.tail(10)

Unnamed: 0,Borough,Neighborhood,Postal code
176,Not assigned,,M6Z
177,Not assigned,,M7Z
178,Etobicoke,Mimico NW / The Queensway West / South of Bloo...,M8Z
179,Not assigned,,M9Z
0,,,
1,,,
2,,,
3,,,
0,,,
1,,,


In [9]:
# They are filled with "NaN" so we can drop them
df_TN_2 = df_TN_1.dropna(how='all')
df_TN_2.tail(10)

Unnamed: 0,Borough,Neighborhood,Postal code
170,Not assigned,,M9Y
171,Not assigned,,M1Z
172,Not assigned,,M2Z
173,Not assigned,,M3Z
174,Not assigned,,M4Z
175,Not assigned,,M5Z
176,Not assigned,,M6Z
177,Not assigned,,M7Z
178,Etobicoke,Mimico NW / The Queensway West / South of Bloo...,M8Z
179,Not assigned,,M9Z


In [11]:
# lets see what it looks like 
df_TN_2.shape

(180, 3)

##### Lets format the DataFrame properly

In [12]:
# let put the columns in the right order, and look at te information of the dataframe
df_TN_3 = df_TN_2[['Postal code','Borough','Neighborhood']]
df_TN_3.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 5.6+ KB


In [15]:
#Lets name the Postal code column correctly
df_TN_3.rename(columns={'Postal code' :'PostalCode'}, inplace=True)
df_TN_3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


## We have some null values in neighborhood, but postal code and borough 

##     are all filled so let check for all the other requirements first

1. Ignore cells with a borough that is Not assigned.
2. More than one neighborhood can exist in one postal code area.
3. If a borough has a Not assigned neighborhood, then the neighborhood will be the same as the borough.
4. Combine the neighborhoods into one row with the neighborhoods separated with a comma.

### 1. Ignore cells with a borough that is Not assigned.

In [16]:
# 1 Lets clean up the set some more: Get names of indexes of Boroughs that are 'Not Assigned'
indexNames = df_TN_3[df_TN_3['Borough']=='Not assigned'].index
# Delete these row indexes from dataFrame
df_TN_4 = df_TN_3.drop(indexNames)
# Lets see what is left
df_TN_4.shape

(103, 3)

In [17]:
#  lets take a peek
df_TN_4.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


##### All rows where Borough is 'Not assigned' were removed

### 2. More than one neighborhood can exist in one postal code area.

In [19]:
# Lets see how many postal codes have more then one Neigborhood
df_TN_4.groupby('Neighborhood').filter(lambda x: len(x) > 1)

Unnamed: 0,PostalCode,Borough,Neighborhood
11,M3B,North York,Don Mills
20,M3C,North York,Don Mills
65,M3K,North York,Downsview
74,M3L,North York,Downsview
83,M3M,North York,Downsview
91,M2N,North York,Willowdale
92,M3N,North York,Downsview
109,M2R,North York,Willowdale


##### Where there is more then one neighborhood, they are not in the same postal code area

### 3. If a borough has a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [20]:
# lets find the boroughs without a neighborhood - Get a True/False series representing which row satisfies the condition 
seriesObj = df_TN_4.apply(lambda x: True if x['Neighborhood'] == 'Not assigned' else False , axis=1)
# Count number of True in series
numOfRows = len(seriesObj[seriesObj == True].index)
 
print('Number of Rows where Neighborhood == Not assigned : ', numOfRows)

Number of Rows where Neighborhood == Not assigned :  0


##### There are no Boroughs where the neighborhood is 'Not assigned'

### 4. Combine the neighborhoods into one row with the neighborhoods separated with a comma.

In [22]:
# Because there are no neighborhoods to combine (see point 2) lets replace the forward slash with a comma
df_TN_5 = df_TN_4.replace(to_replace=' / ',value=', ',regex=True )
df_TN_5.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


##### Forward slashes are replaced, looks like we're ready!

In [23]:
df_TN_5.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PostalCode    103 non-null    object
 1   Borough       103 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 3.2+ KB


##### All entries are filled. All is well, next and last cell is the shape method

In [24]:
df_TN_5.shape

(103, 3)