# Segmenting and Clustering Neighborhoods in Toronto

Import the `requests` library to scrape the Wikipedia page.

In [36]:
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(website_url, 'lxml')

Of the extracted Wikipedia page, only the table with class "wikitable sortable" is relevant, so we extract it using Beautiful Soup.

In [37]:
# Locate the table whose class attribute is "wikitable sortable".
My_table = soup.find('table', {'class': 'wikitable sortable'})
if My_table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError when the table is used later.
    raise ValueError('No table with class "wikitable sortable" was found on the page')

Find all `td` tags in the table and collect them into a list of results.

In [38]:
# Collect every <td> cell of the table.
# find_all is the current Beautiful Soup method name; findAll is a legacy alias.
links = My_table.find_all('td')

Extract the text from each "td" tag.

In [39]:
# Pull the text content out of every <td> element gathered above.
Table_content = [cell.text for cell in links]
    


In [40]:
# Remove every newline character from each cell's text.
Table_content = [cell_text.replace('\n', '') for cell_text in Table_content]

## Using the NumPy array of the extracted table contents, we create the required dataframe

In [41]:
import numpy as np

# Hold the cell strings in a NumPy array; the next cell slices it with a stride of 3.
Table_content = np.array(Table_content)

In [42]:
import pandas as pd

# The flat sequence of cells repeats in groups of three
# (postal code, borough, neighborhood), so each column is every third
# element starting at offset 0, 1 and 2 respectively.
df = pd.DataFrame({
    'PostalCode': Table_content[::3],
    'Borough': Table_content[1::3],
    'Neighborhood': Table_content[2::3],
})

In [43]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Dropping rows whose "Borough" is not assigned

In [44]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df = df[df['Borough'] != 'Not assigned']

In [45]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a column.
df = df.reset_index(drop=True)

Grouping by "PostalCode"; the values of the remaining columns are joined using "," as the separator

In [46]:
# Collapse the table to one row per postal code: every column is cast to str
# and the values within each group are joined with ",".
df1 = (
    df.astype(str)
    .groupby('PostalCode')
    .agg(','.join)
    .reset_index()  # turn the PostalCode group key back into a regular column
)

## Handling the duplicate substring in "Borough"

In [47]:
import re

In [48]:
# Array of the comma-joined Borough strings, one entry per row of df1.
bor_array = df1['Borough'].to_numpy()

In [49]:
# Start with an empty list that will hold one de-duplicated Borough string
# per entry of bor_array.
result = []

In [50]:
# Collapse repeated borough names inside each comma-joined string.
# dict.fromkeys() keeps the first occurrence of every name in its original
# order, whereas a set has no defined order, so a string containing more than
# one distinct borough could otherwise come out ordered differently between runs.
# The list is built from scratch rather than appended to, so re-running this
# cell cannot make `result` longer than `bor_array`.
result = [','.join(dict.fromkeys(bor.split(','))) for bor in bor_array]


In [51]:
# Overwrite the Borough column with the de-duplicated strings.
df1['Borough'] = result

## Handling "Neighborhood" values that are not assigned

In [52]:
# Where a row's Neighborhood is exactly 'Not assigned', use that row's Borough
# instead; every other row keeps its Neighborhood unchanged.
df1['Neighborhood'] = df1['Neighborhood'].where(
    df1['Neighborhood'] != 'Not assigned',
    df1['Borough'],
)

In [53]:
# Preview the first rows of the grouped and cleaned table.
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Printing the shape of the final processed dataframe

In [54]:
# (number of rows, number of columns) of the processed dataframe.
df1.shape

(103, 3)

# Inserting the geospatial data against "PostalCode"

In [55]:
# Read the geospatial data from the CSV sheet provided by Coursera, which is
# used here as an alternative source of coordinates for each postal code.
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [56]:
# Preview the first rows of the geospatial data.
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [58]:
# Give the key column the same name in both frames, then join on it.
# No `how` is passed, so pandas' default join type applies.
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})
geo_merged = geo_df.merge(df1, on='PostalCode')

In [59]:
# Preview the first rows of the merged table.
geo_merged.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge,Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [60]:
# correcting the sequence of data
geo_data=geo_merged[['PostalCode','Borough','Neighborhood','Latitude','Longitude']]

In [61]:
# Preview the first rows of the reordered table.
geo_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [63]:
# (number of rows, number of columns) of the final dataframe.
geo_data.shape

(103, 5)