In [6]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd  # library for data analysis

# Lift pandas' display limits so frames are rendered without truncating
# columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

# json_normalize transforms (nested) JSON records into a flat pandas DataFrame.
# Its pandas.io.json location was deprecated in pandas 1.0 and later removed,
# so import the top-level name and only fall back for pre-1.0 installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


In [43]:
# Download the raw HTML of the Wikipedia page this notebook scrapes.
# timeout: requests waits indefinitely by default, which would hang the kernel.
# raise_for_status: fail here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

In [44]:
# Parse the downloaded HTML using the "html.parser" backend.
soup = BeautifulSoup(markup=data, features="html.parser")

In [45]:
# One accumulator list per table column; each starts out empty.
postalCodeList, boroughList, neighborhoodList = [], [], []

In [46]:
# Copy every data row of the first <table> on the page into the three lists.
postal_table = soup.find('table')
if postal_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the loop below.
    raise ValueError("no <table> element found in the downloaded page")

# Start from empty lists so re-running this cell does not append duplicate rows.
for column_values in (postalCodeList, boroughList, neighborhoodList):
    column_values.clear()

for row in postal_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells are skipped as before; rows with fewer than
    # three cells are skipped too, since cells[2] would raise IndexError.
    if len(cells) < 3:
        continue
    # The cell text is stored as-is, including any trailing newline; the
    # newlines are removed from the DataFrame columns in later cells.
    postalCodeList.append(cells[0].text)
    boroughList.append(cells[1].text)
    neighborhoodList.append(cells[2].text)

In [57]:
# Assemble the three scraped lists into a single DataFrame, one column each.
scraped_columns = {
    "PostalCode": postalCodeList,
    "Borough": boroughList,
    "Neighborhood": neighborhoodList,
}
toronto_df = pd.DataFrame(data=scraped_columns)

# Preview the first rows.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [63]:
# Drop rows whose borough is "Not assigned".
# Compare on the whitespace-stripped text so the filter does not depend on the
# exact trailing "\n" that the scraped cells carry at this point.
borough_is_assigned = toronto_df["Borough"].str.strip() != "Not assigned"
toronto_df_dropna = toronto_df[borough_is_assigned].reset_index(drop=True)
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [68]:
# Remove the newline characters left in the PostalCode column by the scrape.
postal_codes_clean = toronto_df_dropna["PostalCode"].str.replace("\n", "", regex=False)
toronto_df_dropna["PostalCode"] = postal_codes_clean
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,North York\n,Parkwoods\n
1,M2A,North York\n,Victoria Village\n
2,M3A,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M4A,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M5A,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [69]:
# Remove the newline characters left in the Borough column by the scrape.
boroughs_clean = toronto_df_dropna["Borough"].str.replace("\n", "", regex=False)
toronto_df_dropna["Borough"] = boroughs_clean
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,North York,Parkwoods\n
1,M2A,North York,Victoria Village\n
2,M3A,Downtown Toronto,"Regent Park, Harbourfront\n"
3,M4A,North York,"Lawrence Manor, Lawrence Heights\n"
4,M5A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n"


In [70]:
# Remove the newline characters left in the Neighborhood column by the scrape.
neighborhoods_clean = toronto_df_dropna["Neighborhood"].str.replace("\n", "", regex=False)
toronto_df_dropna["Neighborhood"] = neighborhoods_clean
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,North York,Parkwoods
1,M2A,North York,Victoria Village
2,M3A,Downtown Toronto,"Regent Park, Harbourfront"
3,M4A,North York,"Lawrence Manor, Lawrence Heights"
4,M5A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [71]:
# Collapse rows that share the same postal code and borough into one row,
# joining the remaining column's values with ", ".
def join_neighborhoods(names):
    """Return the given names as a single comma-separated string."""
    return ", ".join(names)

grouping_keys = ["PostalCode", "Borough"]
toronto_df_grouped = toronto_df_dropna.groupby(grouping_keys, as_index=False).agg(join_neighborhoods)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,North York,Parkwoods
1,M1B,Downtown Toronto,"Garden District, Ryerson"
2,M1C,Scarborough,"Guildwood, Morningside, West Hill"
3,M1E,North York,Hillcrest Village
4,M1G,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands"


In [75]:
# For rows whose Neighborhood is "Not assigned", use the Borough name instead.
# iterrows() may yield copies of the rows, so assigning to `row` inside the
# loop is not guaranteed to update the DataFrame; write through .loc with a
# boolean mask so the change always lands in toronto_df_grouped.
neighborhood_missing = toronto_df_grouped["Neighborhood"] == "Not assigned"
toronto_df_grouped.loc[neighborhood_missing, "Neighborhood"] = toronto_df_grouped.loc[neighborhood_missing, "Borough"]

toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,North York,Parkwoods
1,M1B,Downtown Toronto,"Garden District, Ryerson"
2,M1C,Scarborough,"Guildwood, Morningside, West Hill"
3,M1E,North York,Hillcrest Village
4,M1G,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands"


In [74]:
# Show the (rows, columns) dimensions of the grouped DataFrame.
toronto_df_grouped.shape

(103, 3)