# Segmenting and Clustering Neighborhoods in Toronto
#### Done by H-Snoussi
#### Thank you very much for taking the time to review my assignment!

## 1. Importing libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## 2. Scrape the table from Wikipedia.org

In [7]:
# Download the Wikipedia page of Canadian postal codes starting with "M" and
# locate the sortable wikitable that the following cells parse.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url, timeout=30)  # bound the wait so the cell cannot hang forever
r.raise_for_status()  # fail on an HTTP error instead of parsing an error page
data = r.text
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (results can differ between machines) and warns.
soup = BeautifulSoup(data, 'html.parser')
table = soup.find('table', attrs={'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; report that here rather
    # than letting a later cell fail with an AttributeError on None.
    raise ValueError("No 'wikitable sortable' table found at " + url)

## 3. Build a DataFrame from the table

In [14]:
# Turn each row of the scraped table into one record and build the DataFrame
# in a single call. Growing the frame with DataFrame.append inside the loop
# copies the whole frame on every iteration, and DataFrame.append was removed
# in pandas 2.0.
columns = ['PostalCode', 'Borough', 'Neighborhood']
records = []
for tr in table.find_all('tr')[1:]:  # skip the first <tr> (presumably the header row)
    # Stripped text of every non-empty <td> in this row.
    row = [td.text.strip() for td in tr.find_all('td') if td.text.strip()]
    if len(row) < len(columns):
        # Incomplete row: it cannot fill all three columns, so skip it
        # rather than raising IndexError.
        continue
    records.append(dict(zip(columns, row)))
df = pd.DataFrame(records, columns=columns)
print(df.shape)
df.head()

(287, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 4. Drop rows without an assigned borough

In [15]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df_drop = df.loc[df['Borough'].ne('Not assigned')]
print(df_drop.shape)
df_drop.head()

(210, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## 5. Combine neighborhoods that share the same postal code

In [25]:
# Collapse rows that share a (PostalCode, Borough) pair into one row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
# The Neighborhood column is selected explicitly so only it is joined, and
# reset_index() returns a new frame instead of mutating one in place.
df_drop_neigh = (
    df_drop
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
print(df_drop_neigh.shape)
df_drop_neigh.head(10)

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## 6. Replace a "Not assigned" neighborhood with its borough name

In [31]:
# Boolean mask: True where the Neighborhood value is exactly 'Not assigned'.
df_drop_neigh_notassigned = df_drop_neigh['Neighborhood'].eq('Not assigned')
df_drop_neigh_notassigned.head()

0    False
1    False
2    False
3    False
4    False
Name: Neighborhood, dtype: bool

In [35]:
# Where the mask is True, take the Borough value as the Neighborhood;
# every other row keeps its current Neighborhood.
df_drop_neigh['Neighborhood'] = df_drop_neigh['Neighborhood'].mask(
    df_drop_neigh_notassigned, df_drop_neigh['Borough']
)
# Show the rows selected by the mask.
df_drop_neigh[df_drop_neigh_notassigned]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


## 7. Number of rows and columns of the dataframe

In [37]:
# Print the size of the table as a (rows, columns) tuple.
print((len(df_drop_neigh.index), len(df_drop_neigh.columns)))

(103, 3)


In [46]:
# Display the table; the rendered output below elides the middle rows.
df_drop_neigh

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
