## Segmenting and clustering neighboorhoods in Toronto

In [17]:
# importing dependencies
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests

## Set up the url and access the content on wikipedia

The first method will use BeautifulSoup to scrape the webpage

In [6]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url)
page = bs(req.content)
table = page.find('table')
rows = table.findAll('tr')
table_data = {}
idx = 0

for r in rows:
    row_data = r.findAll('td')
    if row_data:
        table_data.update({
            idx:[
                row_data[0].text, 
                row_data[1].text, 
                row_data[2].text.replace("\n",""),
            ]
        })
        idx+=1

In [7]:
column_names = ['Postal Code', 'Borough', 'Neighbourhood']

# Create the dataframe

df = pd.DataFrame.from_dict(data = table_data, orient='index', columns = column_names)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned
1,M2A\n,Not assigned\n,Not assigned
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"


## Different method: Pandas only

This time, using only pandas and the pd.read_html() method. 

In [23]:
# Another method

df_2 = pd.read_html(str(table), header = 0)[0]
df_2.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [24]:
# Ignoring the "Not assigned" rows

df_2 = df_2[df_2.Borough != 'Not assigned']
df_2 = pd.DataFrame(df_2.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply(', '.join))
df_2 = df_2.reset_index()
df_2.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [28]:
# Checking the M5A Postal code
print('Verify that the \'M5A\'-case is correct : \n{}\n '.format(df_2[df_2['Postal Code'] == 'M5A']))
df_2.Neighborhood[df_2.Neighborhood == 'Not assigned'] = df_2.Borough
print('verify the Neighborhood \'Not assigned\' method:\n{}\n'.format(df_2[df_2.Borough == 'Queen\'s Park']))
df_2.head(10)

Verify that the 'M5A'-case is correct : 
   Postal Code           Borough               Neighborhood
53         M5A  Downtown Toronto  Regent Park, Harbourfront
 
verify the Neighborhood 'Not assigned' method:
Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []



Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Last requirement: 

Use the .shape method to print the number of rows of your dataframe.use the .shape method to print the number of rows of your dataframe.

In [29]:
print('The number of rows of the dataframe is {}'.format(df_2.shape[0]))

The number of rows of the dataframe is 103
