![Cognitive Class logo](https://cognitiveclass.ai/wp-content/themes/bdu3.0/static/images/cc-logo.png)

# Segmenting and Clustering Neighborhoods in Toronto

In [2]:
# Installing web-parsing libraries
# Both packages are pulled from the conda-forge channel; `--yes` confirms the
# install without prompting so the cell can run unattended.
# beautifulsoup4 provides the `bs4` module imported further down.
!conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge lxml --yes    # parser for html file

Collecting package metadata: done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs:
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.7.1       |        py36_1001         140 KB  conda-forge
    conda-4.6.4                |           py36_0         877 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        1017 KB

The following packages will be UPDATED:

  beautifulsoup4      anaconda::beautifulsoup4-4.7.1-py36_1 --> conda-forge::beautifulsoup4-4.7.1-py36_1001
  conda                                        4.6.3-py36_0 --> 4.6.4-py36_0



Downloading and Extracting Packages
beautifulsoup4-4.7.1 | 140 KB    | ##################################### | 100% 
conda-4.6.4          | 877 KB    | ###########

In [36]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

In [40]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# find() returns the first matching <table>, or None when nothing matches.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # Fail with a clear message rather than an AttributeError in a later cell.
    raise ValueError("No table with class 'wikitable sortable' found on the page")

In [102]:
# Locate the postal-code table again, then keep only its text content.
# From here on `table` is a plain string rather than a BeautifulSoup tag.
table = soup.find('table', class_='wikitable sortable')
table = table.text
len(table)

9385

In [86]:
# Peek at the first 300 characters to see how rows and cells are separated
table[:300]

'\n\nPostcode\nBorough\nNeighbourhood\n\n\nM1A\nNot assigned\nNot assigned\n\n\nM2A\nNot assigned\nNot assigned\n\n\nM3A\nNorth York\nParkwoods\n\n\nM4A\nNorth York\nVictoria Village\n\n\nM5A\nDowntown Toronto\nHarbourfront\n\n\nM5A\nDowntown Toronto\nRegent Park\n\n\nM6A\nNorth York\nLawrence Heights\n\n\nM6A\nNorth York\nLawrence Manor\n\n\nM7A'

In [35]:
# Peek at the end of the text: everything from character 9000 onward,
# except the very last character (the -1 stop index excludes it).
table[9000:-1]

'ssigned\nNot assigned\n\n\nM3Z\nNot assigned\nNot assigned\n\n\nM4Z\nNot assigned\nNot assigned\n\n\nM5Z\nNot assigned\nNot assigned\n\n\nM6Z\nNot assigned\nNot assigned\n\n\nM7Z\nNot assigned\nNot assigned\n\n\nM8Z\nEtobicoke\nKingsway Park South West\n\n\nM8Z\nEtobicoke\nMimico NW\n\n\nM8Z\nEtobicoke\nThe Queensway West\n\n\nM8Z\nEtobicoke\nRoyal York South West\n\n\nM8Z\nEtobicoke\nSouth of Bloor\n\n\nM9Z\nNot assigned\nNot assigned\n'

#### After removing the blank-line separators (`\n\n`), the text can be split on `\n` to get a flat list of cell strings

In [103]:
# Step 1: drop every pair of consecutive newlines, leaving one '\n' between cells.
# Step 2: split on the remaining newlines to get a flat list of cell strings.
# Note: `table` changes from str to list here, so this cell cannot be re-run
# on its own without first re-running the cell that builds the string.
table = table.replace('\n\n', '')
table = table.split('\n')

#### Convert the list into a dataframe; the -1 in `reshape` lets NumPy infer the number of rows

In [104]:
# Skip the first three entries (the header cells), then fold the flat list
# into rows of three values: postal code, borough, neighborhood.
Toronto_df = pd.DataFrame(
    data=np.array(table[3:]).reshape(-1, 3),
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
Toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Filter out rows whose Borough is 'Not assigned'

In [105]:
# Keep only rows with a real borough, then renumber the index from 0
Toronto_df = (
    Toronto_df
    .loc[Toronto_df['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
Toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### If a Neighborhood is 'Not assigned', give it the same name as its Borough

In [106]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# A single .loc[rows, column] assignment writes into Toronto_df itself; the chained
# form df[col][mask] = ... can end up writing to a temporary copy
# (pandas reports this as SettingWithCopyWarning).
unassigned = Toronto_df['Neighborhood'] == 'Not assigned'
Toronto_df.loc[unassigned, 'Neighborhood'] = Toronto_df.loc[unassigned, 'Borough']

#### Create a new dataframe in which neighborhoods that share a postal code are combined into a single row

In [107]:
# Combine the neighborhoods of each postal code into a single row.
# This replaces a nested Python loop that rescanned the whole frame once per
# postal code; the pandas calls below do not depend on the frame's index labels.

# Distinct postal codes, in order of first appearance
PostalCode_ = Toronto_df['PostalCode'].unique()

# Borough of the first row seen for each postal code
# (drop_duplicates keeps the first occurrence, in the same order as unique())
Borough_ = Toronto_df.drop_duplicates(subset='PostalCode')['Borough'].tolist()

# One list of neighborhood names per postal code; sort=False keeps the groups
# in first-appearance order so they line up with PostalCode_ and Borough_
Neighborhood_ = (
    Toronto_df.groupby('PostalCode', sort=False)['Neighborhood']
    .apply(list)
    .tolist()
)

Toronto_new_df = pd.DataFrame({'PostalCode':PostalCode_, 'Borough':Borough_, 'Neighborhood':Neighborhood_})
Toronto_new_df.head(10)

## Show the number of rows in the final dataframe

In [111]:
# Number of rows, i.e. one per distinct postal code
len(Toronto_new_df)

103