### Importing Libraries ###

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option("display.max_columns", None) # None lifts the column limit: every column is shown
pd.set_option("display.max_rows", None) # None lifts the row limit: a bare frame renders all of its rows

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

# transform JSON file into a pandas dataframe.
# pandas >= 1.0 exposes json_normalize at the top level and deprecates the
# pandas.io.json path; older releases only provide the legacy path.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#import folium # map rendering library

print("Libraries imported.")

Libraries imported.


In [2]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 12.4MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [3]:
import folium
print('folium imported')

folium imported


### Scraping data from Webpage ###

In [74]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
URL = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [75]:
# Download the page. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error (4xx/5xx)
# into an exception instead of letting an error page be parsed as data.
response = requests.get(URL, timeout=30)
response.raise_for_status()
html_content = response.text

In [76]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(html_content, features='html.parser')

In [77]:
#print(soup.prettify()) # data in html format parsed

In [78]:
# Text of the page's <title> element (the page title, not a table caption).
page_title = soup.title.get_text()
print(page_title)

List of postal codes of Canada: M - Wikipedia


In [79]:
# creating empty lists to store table values as lists

In [80]:
# One accumulator per table column; each starts as its own empty list.
postcode_L, borough_L, neighbourhood_L = [], [], []

In [81]:
# Every <tr> row of the first <table> on the page (attribute access on the
# soup is shorthand for find()).
soup.table.find_all('tr')

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_Yor

In [82]:
# All <tr> rows of the first table, header row included.
first_table = soup.find('table')
first_table.find_all('tr')

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_Yor

In [83]:
# Extract the three <td> cells (postcode, borough, neighbourhood) of each row.

# Start from empty lists every time: appending to lists created in an earlier
# cell would duplicate every row whenever this cell is executed again.
postcode_L, borough_L, neighbourhood_L = [], [], []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if not cells:  # the header row contains <th> cells only
        continue
    # Strip surrounding whitespace from every cell, not just the trailing
    # newline of the last one, so stray padding cannot break later comparisons
    # such as == "Not assigned".
    postcode_L.append(cells[0].text.strip())
    borough_L.append(cells[1].text.strip())
    neighbourhood_L.append(cells[2].text.strip())
        

In [84]:
# Assemble the scraped lists into one frame; the dict's insertion order
# determines the column order.
scraped_columns = {
    "Post Code": postcode_L,
    "Neighbourhood": neighbourhood_L,
    "Borough": borough_L,
}
df = pd.DataFrame(scraped_columns)

In [85]:
# Preview only: display.max_rows is set to None in this notebook, so a bare
# `df` would render every row of the scraped table.
df.head(10)

Unnamed: 0,Post Code,Neighbourhood,Borough
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,Parkwoods,North York
3,M4A,Victoria Village,North York
4,M5A,Harbourfront,Downtown Toronto
5,M6A,Lawrence Heights,North York
6,M6A,Lawrence Manor,North York
7,M7A,Queen's Park,Downtown Toronto
8,M8A,Not assigned,Not assigned
9,M9A,Islington Avenue,Etobicoke


## Data Wrangling ##

In [86]:
df.shape # (rows, columns) of the scraped table before any rows are dropped

(287, 3)

## 1. Only process the cells that have an assigned borough. Ignore cells whose borough is "Not assigned". ##

In [87]:
# Keep only the rows whose Borough holds a real value.
has_borough = df['Borough'].ne("Not assigned")
df_NAdropped = df.loc[has_borough]

In [88]:
# Preview of the frame without "Not assigned" boroughs. head() is used because
# display.max_rows is None in this notebook, so a bare frame renders every row.
df_NAdropped.head(10)

Unnamed: 0,Post Code,Neighbourhood,Borough
2,M3A,Parkwoods,North York
3,M4A,Victoria Village,North York
4,M5A,Harbourfront,Downtown Toronto
5,M6A,Lawrence Heights,North York
6,M6A,Lawrence Manor,North York
7,M7A,Queen's Park,Downtown Toronto
9,M9A,Islington Avenue,Etobicoke
10,M1B,Rouge,Scarborough
11,M1B,Malvern,Scarborough
13,M3B,Don Mills North,North York


In [89]:
df_NAdropped.shape # (rows, columns) after removing rows whose Borough is "Not assigned"

(210, 3)

## 2. More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table. ##

In [90]:
# Collapse rows that share a postal code and borough into a single row whose
# remaining column is the comma-joined string of the grouped values.
postcode_groups = df_NAdropped.groupby(['Post Code', 'Borough'], as_index=False)
df_grouped = postcode_groups.agg(",".join)

In [91]:
# Preview only: display.max_rows is None in this notebook, so a bare frame
# would render every grouped row.
df_grouped.head(10)

Unnamed: 0,Post Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## 3. If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park. ##

In [92]:
# Replace a "Not assigned" neighbourhood with the row's borough.
# This must not be done by writing into the rows yielded by iterrows(): each
# yielded row is a copy, so such an assignment never reaches df_grouped.
# Series.where keeps the values for which the condition is True and takes the
# aligned Borough value everywhere else; re-running the cell is harmless.
df_grouped["Neighbourhood"] = df_grouped["Neighbourhood"].where(
    df_grouped["Neighbourhood"] != "Not assigned",
    df_grouped["Borough"],
)


In [93]:
# Preview after the "Not assigned" neighbourhood replacement step. head() is
# used because display.max_rows is None, so a bare frame renders every row.
df_grouped.head(10)

Unnamed: 0,Post Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## 4. Shape of DataFrame ##

In [94]:
df_grouped.shape # (rows, columns) of the frame grouped by postal code and borough

(103, 3)

## 5. Loading CSV file into DataFrame ##

In [96]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

import os

# @hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that no
# credential is stored in the notebook. A key that was ever saved in a shared
# copy of this notebook should be revoked and replaced.
ibm_api_key = os.environ.get("IBM_COS_API_KEY")
if not ibm_api_key:
    raise RuntimeError(
        "IBM_COS_API_KEY is not set; export your IBM Cloud Object Storage API key "
        "before running this cell."
    )

client_ddbc86de65744c1d81b5bd0b750a9ac8 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key,
    ibm_auth_endpoint="https://iam.eu-gb.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

# Streaming body of the stored CSV object.
body = client_ddbc86de65744c1d81b5bd0b750a9ac8.get_object(Bucket='toronto-donotdelete-pr-fml7v4fc9lcbas',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# If you are reading an Excel file into a pandas DataFrame, replace `read_csv` by `read_excel` in the next statement.
df_data_0 = pd.read_csv(body)
df_data_0.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
