# Segmenting and Clustering Neighborhoods in Toronto

In [294]:
import requests
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', None)

<font color='Blue'>Assign the link of the website to a variable named 'url'</font>

In [418]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

<font color='Blue'>Read the source code and create a BeautifulSoup object </font>

In [419]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(url,'lxml')
#print(soup.prettify())

<font color='Blue'>All the table contents which we intend to extract is under class Wikitable Sortable </font>

<font color='green'>Then extract all the rows within 'tr' </font>

In [420]:
My_table = soup.find('table',{'class':'wikitable sortable'})
rows=My_table.findAll('tr')
#rows

<font color='Blue'>Define an empty data frame and append the information within 'td' row by row </font>

In [421]:
Mydata = []
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    Mydata.append([ele for ele in cols if ele])

In [422]:
pd.DataFrame(Mydata).head()

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


<font color='Blue'> Rename columns and drop the first row </font>

In [405]:
WikiData=pd.DataFrame(Mydata, columns=['PostalCode', 'Borough', 'Neighborhood'])
WikiData.drop(0, inplace=True)
print(WikiData.shape)
WikiData.head()

(287, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


<font color='Blue'>Ignore cells with a borough that is Not assigned</font>

<font color='green'>If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough</font>

In [407]:
WikiData.drop(WikiData[WikiData['Borough'] == 'Not assigned'].index, inplace = True) 
# 
WikiData['Neighborhood'].loc[(WikiData['Borough'] != 'Not assigned') & (WikiData['Neighborhood'] == 'Not assigned')]=WikiData['Borough']
print(WikiData.shape)
WikiData.head()

(210, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


<font color='Blue'>Make the wide format of the data, ignoring 'Borough' for now and Replace the 'NaN' values with " "</font>

In [409]:
# 
WikiData2=WikiData.pivot(index='PostalCode', columns='Neighborhood', values='Neighborhood')
# 
WikiData2.fillna('', inplace=True)
WikiData2.reset_index(inplace=True)
#WikiData2.tail()

<font color='Blue'>Concat columns row by row with a ',' in between</font>
<font color='Blue'>remove the last ', ' in the column</font>

In [412]:
# 
WikiData2['Neighborhoods']=''
for row in range(0,103):
    for index in range(1,208):
        if WikiData2.iloc[row,index] != '':
            temp=WikiData2.iloc[row,index]
            WikiData2['Neighborhoods'][row]= WikiData2['Neighborhoods'][row]+temp+', '
WikiData3=WikiData2[['PostalCode','Neighborhoods']].copy()
WikiData3['Neighborhoods'] = WikiData3['Neighborhoods'].map(lambda x: str(x)[:-2])
# Add the Borough column
WikiData3.head()

Neighborhood,PostalCode,Neighborhoods
0,M1B,"Malvern, Rouge"
1,M1C,"Highland Creek, Port Union, Rouge Hill"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


<font color='Blue'>Merge with original Data to add Borough and then remove duplicates </font>

In [424]:
WikiData4=pd.merge(WikiData3, WikiData[['PostalCode','Borough']], left_on=['PostalCode'],
                   right_on=['PostalCode'],  how='inner')
WikiData4.drop_duplicates(inplace=True)
WikiData4.head()

Unnamed: 0,PostalCode,Neighborhoods,Borough
0,M1B,"Malvern, Rouge",Scarborough
2,M1C,"Highland Creek, Port Union, Rouge Hill",Scarborough
5,M1E,"Guildwood, Morningside, West Hill",Scarborough
8,M1G,Woburn,Scarborough
9,M1H,Cedarbrae,Scarborough


<font color='Blue'> Clean the Dataframe</font>

In [425]:
#
WikiData4.insert(3, 'Neighborhood', WikiData4['Neighborhoods'].values)
WikiData4.drop(columns=['Neighborhoods'], inplace=True)
WikiData2.reset_index(inplace=True)
pd.set_option('display.max_rows', None)
WikiData4.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
5,M1E,Scarborough,"Guildwood, Morningside, West Hill"
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


In [426]:
WikiData4.shape

(103, 3)