In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

In [2]:
# Install BeautifulSoup (the HTML scraping library) from the conda-forge channel.
# Only needed when the package is not already present in the environment;
# --yes answers conda's confirmation prompt automatically.
!conda install -c conda-forge beautifulsoup4 --yes

In [3]:
from bs4 import BeautifulSoup

### Installing the lxml and html5lib parsers

In [4]:
# Install the parser backends for BeautifulSoup from the conda-forge channel
# (only needed when they are not already present in the environment).
# NOTE(review): the parsing cell in this notebook passes 'lxml' to BeautifulSoup;
# html5lib does not appear to be used in the cells visible here -- confirm it is still needed.
!conda install -c conda-forge lxml --yes
!conda install -c conda-forge html5lib --yes

### Getting needed data from Wikipedia webpage and scraping it with BeautifulSoup

In [5]:
# Download the Wikipedia page that lists the postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
# silently handing an error page to the parser.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_response.raise_for_status()
html_doc = wiki_response.text

In [6]:
# Parse the downloaded markup into a searchable tree, using the lxml parser backend.
soup = BeautifulSoup(markup=html_doc, features='lxml')

In [7]:
# Locate the first <table> whose class attribute is exactly "wikitable sortable".
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # find() returns None when nothing matches (e.g. the page layout changed);
    # stop here with a clear message rather than an AttributeError further down.
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")


### Preparing Pandas' Dataframe

In [8]:
# Column layout of the scraped table; start from an empty frame with that schema.
column = [
    'PostalCode',
    'Borough',
    'Neighbourhood',
]
Toronto = pd.DataFrame(data=None, columns=column)
Toronto

Unnamed: 0,PostalCode,Borough,Neighbourhood


### Using a loop to get all the information into the Toronto DataFrame

In [9]:
# Loop over the table rows (<tr>, header row skipped) and read the first three
# cells (<td>) of each: postal code, borough, neighbourhood.
#   * Cell text is stripped of surrounding whitespace (e.g. a trailing "\n").
#   * Rows whose borough is "Not assigned" are dropped.
#   * A "Not assigned" neighbourhood is replaced with the borough name.
# The rows are collected in a plain list and the dataframe is built once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Rebuilding from scratch also makes this cell safe to re-run
# without duplicating rows.

records = []
for tr in table.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds) < 3:
        # Not a complete data row (e.g. a row made of header cells only).
        continue
    post, borough, neighborhood_name = (td.text.strip() for td in tds[:3])
    if 'Not assigned' in borough:
        continue
    if 'Not assigned' in neighborhood_name:
        neighborhood_name = borough
    records.append({'PostalCode': post,
                    'Borough': borough,
                    'Neighbourhood': neighborhood_name})

# Passing `columns` keeps the schema and column order even if no rows were kept.
Toronto = pd.DataFrame(records, columns=column)

In [10]:
# Preview the first ten scraped rows.
Toronto.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### Adjusting the dataframe so that all rows with the same "PostalCode" are combined into one row, with their "Neighbourhood" values joined by ", ".

In [11]:
# One row per (PostalCode, Borough) pair; the neighbourhood names that share
# the pair are concatenated into a single comma-separated string.
Toronto_grp = (
    Toronto
    .groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)

In [12]:
# Preview the first fifteen grouped rows.
Toronto_grp.head(n=15)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [13]:
# (rows, columns) of the grouped frame -- one row per (PostalCode, Borough) pair.
Toronto_grp.shape


(103, 3)