### This project clusters Toronto neighborhoods using postal-code data scraped from Wikipedia

1- Import the appropriate libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import requests
from bs4 import BeautifulSoup

2- Prepare the web scraping code by utilizing BeautifulSoup

In [5]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page once and reuse the bytes for parsing; the timeout keeps
# the cell from hanging forever if the server does not answer.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.content
# Parse the downloaded HTML with the lxml parser.
content = BeautifulSoup(source, 'lxml')

In [6]:
# Take the first <table> element of the parsed page and collect all of its
# <td> cells in document order.
table = content.find('table')
td = table.find_all('td')

# Empty containers for the three columns that are filled from the cells.
postcode, borough, neighbourhood = [], [], []

3- Fill the three lists (postal code, borough, neighbourhood) with the scraped data

In [7]:
# The cells are consumed in groups of three (postal code, borough,
# neighbourhood); any other count means the table does not have the layout
# this cell expects, so stop with a clear message.
if len(td) % 3 != 0:
    raise ValueError(
        'Expected the number of <td> cells to be a multiple of 3, got {}'.format(len(td))
    )

# Rebuild the lists from scratch instead of appending to them, so that
# re-running this cell does not duplicate every row.
cell_text = [cell.text.strip() for cell in td]
postcode = cell_text[0::3]
borough = cell_text[1::3]
neighbourhood = cell_text[2::3]

4- Create the actual DataFrame with the lists previously scraped and give the columns appropriate names

In [8]:
# Stack the three lists as labelled rows, then transpose so that each label
# becomes a column name and each scraped record becomes one row.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df_codes = pd.DataFrame(data=[postcode, borough, neighbourhood], index=column_names).T

5- Drop the rows whose borough has the 'Not assigned' value

In [14]:
# Keep only rows with a real borough: drop the 'Not assigned' placeholder and
# any missing values.
# A boolean mask is used instead of calling `.replace(..., inplace=True)` on
# the selected column, because an in-place call on a column selection is not
# guaranteed to update df_codes itself (it does not under pandas Copy-on-Write).
has_borough = df_codes['Borough'].notna() & (df_codes['Borough'] != 'Not assigned')
# .copy() gives an independent frame so later assignments do not warn about
# writing to a slice.
df_codes = df_codes.loc[has_borough].copy()

6- Combine the neighborhoods that share one postal code into a single comma-separated row

In [21]:
# One output row per (postal code, borough) pair; the neighborhoods of each
# pair are concatenated into a single ', '-separated string.
group_keys = ['PostalCode', 'Borough']
joined_names = df_codes.groupby(group_keys)['Neighborhood'].apply(
    lambda names: ', '.join(names)
)
# Turn the group keys back into ordinary columns and restate the column names.
df_codes = joined_names.reset_index()
df_codes.columns = group_keys + ['Neighborhood']

7- If a cell has a borough but a 'Not assigned' neighborhood, use the borough name as the neighborhood name

In [None]:
# Where a row has a borough but its neighborhood is the 'Not assigned'
# placeholder, reuse that row's borough name as the neighborhood. Taking the
# value from the Borough column (rather than a hard-coded name) keeps this
# correct for any borough, and assigning through .loc updates df_codes itself.
no_neighborhood = df_codes['Neighborhood'] == 'Not assigned'
df_codes.loc[no_neighborhood, 'Neighborhood'] = df_codes.loc[no_neighborhood, 'Borough']

In [19]:
# Preview the first 12 rows of the cleaned table as a quick sanity check.
df_codes.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


8- Use the `.shape` attribute to show the number of rows (and columns) of the dataframe

In [20]:
# .shape is a (rows, columns) tuple; the first entry is the number of rows.
df_codes.shape

(103, 3)