# Applied Data Science Capstone Assignment 2 :Segmenting and Clustering Neighborhoods in Toronto


In [37]:
# Import necessary libraries

import requests
import lxml.html as lh
import pandas as pd

In [38]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle, page, to handle the contents of the website
page = requests.get(url)

#Store the contents of the website under doc
doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [39]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

#### This means that there are 3 columns per row

In [40]:
# Parse the first row as our header
tr_elements = doc.xpath('//tr')

#Create empty list
col=[]
i=0

#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postal Code
"
2:"Borough
"
3:"Neighborhood
"


#### Creating Pandas DataFrame
### Each header is appended to a tuple along with an empty list.

In [41]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [42]:
# Check the length of each column. Ideally, they should all be the same
[len(C) for (title,C) in col]

[181, 181, 181]

##### This shows that each of the 3 columns has exactly 181 rows

### Creating the pandas data frame

In [43]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [44]:
# Access the top 5 rows of the data frame 
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [30]:
df.columns = ['Borough', 'Neighbourhood','Postcode']

cols = df.columns.tolist()
cols

cols = cols[-1:] + cols[:-1]

df = df[cols]

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,Not assigned\n,M1A\n,Not assigned\n
1,Not assigned\n,M2A\n,Not assigned\n
2,Parkwoods\n,M3A\n,North York\n
3,Victoria Village\n,M4A\n,North York\n
4,"Regent Park, Harbourfront\n",M5A\n,Downtown Toronto\n


In [56]:
df = df.replace('\n',' ', regex=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [57]:
cols[-1:]

['Neighbourhood']

In [58]:
cols[:-1]

['Postcode', 'Borough']

### Cleaning the messy string in the Borough column

In [59]:
df = df.replace('\n',' ', regex=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Dropping all cells with a borough that is Not assigned

In [62]:
df.rename(columns = {'Postal Code\n':'Postal Code','Borough\n':'Borough','Neighborhood\n':'Neighbourhood'}, inplace = True) 

In [63]:
df.columns

Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')

In [65]:
df['Borough'].unique

<bound method Series.unique of 0               Not assigned 
1               Not assigned 
2                 North York 
3                 North York 
4           Downtown Toronto 
5                 North York 
6           Downtown Toronto 
7               Not assigned 
8                  Etobicoke 
9                Scarborough 
10              Not assigned 
11                North York 
12                 East York 
13          Downtown Toronto 
14                North York 
15              Not assigned 
16              Not assigned 
17                 Etobicoke 
18               Scarborough 
19              Not assigned 
20                North York 
21                 East York 
22          Downtown Toronto 
23                      York 
24              Not assigned 
25              Not assigned 
26                 Etobicoke 
27               Scarborough 
28              Not assigned 
29              Not assigned 
                ...          
151                Etobicoke 
152      

In [66]:
df.drop(df.index[df["Borough"] == 'Not assigned '], inplace = True)

# Reset the index and dropping the previous index
df = df.reset_index(drop=True)

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [70]:
df['Neighbourhood'].unique()

array([' ', 'Malvern, Rouge ', 'Rouge Hill, Port Union, Highland Creek ',
       'Guildwood, Morningside, West Hill ', 'Woburn ', 'Cedarbrae ',
       'Scarborough Village ',
       'Kennedy Park, Ionview, East Birchmount Park ',
       'Golden Mile, Clairlea, Oakridge ',
       'Cliffside, Cliffcrest, Scarborough Village West ',
       'Birch Cliff, Cliffside West ',
       'Dorset Park, Wexford Heights, Scarborough Town Centre ',
       'Wexford, Maryvale ', 'Agincourt ',
       "Clarks Corners, Tam O'Shanter, Sullivan ",
       "Milliken, Agincourt North, Steeles East, L'Amoreaux East ",
       "Steeles West, L'Amoreaux West ", 'Upper Rouge ',
       'Hillcrest Village ', 'Fairview, Henry Farm, Oriole ',
       'Bayview Village ', 'York Mills, Silver Hills ',
       'Willowdale, Newtonbrook ', 'Willowdale, Willowdale East ',
       'York Mills West ', 'Willowdale, Willowdale West ', 'Parkwoods ',
       'Don Mills ', 'Bathurst Manor, Wilson Heights, Downsview North ',
       'Northw

#### Combining Neighbourhoods based on similar Postcode and Borough

In [69]:
df = df.groupby(['Postal Code', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()
df.columns = ['Postal Code','Borough','Neighbourhood']
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,,Canadian postal codes,
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
8,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
9,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"


In [86]:
df.drop(df.index[df['Borough'] == 'Canadian postal codes'], inplace = True)

#### Removing any space in the start of the string

In [87]:
df['Neighbourhood'] = df['Neighbourhood'].str.strip()

In [88]:
df.sample(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
98,M9M,North York,"Humberlea, Emery"
86,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
61,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange"
67,M5S,Downtown Toronto,"University of Toronto, Harbord"
89,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores"


In [89]:
df[df['Borough'] == 'Queen\'s Park']

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [90]:
df['Neighbourhood'].unique

<bound method Series.unique of 0                                                       
1                                         Malvern, Rouge
2                 Rouge Hill, Port Union, Highland Creek
3                      Guildwood, Morningside, West Hill
4                                                 Woburn
5                                              Cedarbrae
6                                    Scarborough Village
7            Kennedy Park, Ionview, East Birchmount Park
8                        Golden Mile, Clairlea, Oakridge
9        Cliffside, Cliffcrest, Scarborough Village West
10                           Birch Cliff, Cliffside West
11     Dorset Park, Wexford Heights, Scarborough Town...
12                                     Wexford, Maryvale
13                                             Agincourt
14               Clarks Corners, Tam O'Shanter, Sullivan
15     Milliken, Agincourt North, Steeles East, L'Amo...
16                         Steeles West, L'Amoreaux West


In [91]:
df_neigh = df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood']

In [92]:
df_neigh

Series([], Name: Neighbourhood, dtype: object)

In [93]:
df_neigh = df.loc[df['Neighbourhood'] == 'Not assigned']

In [94]:
df_neigh

Unnamed: 0,Postal Code,Borough,Neighbourhood


#### Assigning Borough values to the Neignbourhood where vlaue is "Not assigned"

In [95]:
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']

In [96]:
# Check if the Neighbourhood for Queen's Park changed 
df[df['Borough'] == 'Queen\'s Park']

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [97]:
# Check the shape of the data frame
df.shape

(104, 3)