# Welcome to the Coursera Notebook!

To get started, we need to import some libraries for working with the data, plus a few more for scraping the URL.

In [1]:
import numpy as np
import pandas as pd # library for data analysis
# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import geocoder # import geocoder

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# json_normalize lives in the top-level pandas namespace since pandas 1.0;
# the old pandas.io.json location is deprecated, so it is only a fallback
# for older installations.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


Libraries imported.


In [2]:
# Standard-library helper for opening URLs.
from urllib.request import urlopen

# HTML parsing libraries; lxml is what pd.read_html() builds on.
import html5lib
import lxml
from lxml import etree

print('import success!')

# BeautifulSoup helps with scraping the HTML behind a URL.
from bs4 import BeautifulSoup
print('bs4 has arrived')


import success!
bs4 has arrived


Now that our libraries are imported, let's scrape the Wikipedia URL and get the table we want.
From reviewing the wiki page I can see that there are a number of tables on the same page. To avoid pulling them all into a list, we will ask for only the tables that contain the text "Neighbourhood" (I personally enjoy how they spelt 'Neighbourhood' right ;-D )

In [3]:
# Wikipedia page listing the postal codes that start with "M"; fetched over
# HTTPS rather than plain HTTP.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns a list of DataFrames; `match` keeps only the tables whose
# text contains "Neighbourhood".
toronto_hood_codes = pd.read_html(url, match="Neighbourhood")

Hmmm, this is not a single DataFrame yet: `read_html` returns a list of DataFrames, one per matching table. So how many tables do we have here?

In [4]:
# Number of tables on the page that matched "Neighbourhood".
len(toronto_hood_codes)

1

Let's get this into a DataFrame.

In [5]:
# Build the working frame from the first matched table and preview its
# first five rows.
df = pd.DataFrame(data=toronto_hood_codes[0])
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# (rows, columns) of the scraped table before any cleaning.
df.shape

(180, 3)

Now let's deal with those missing values, as we don't want them in our data.

In [7]:
# Turn the 'Not assigned' placeholders into real missing values so they can
# be dropped afterwards. np.nan is the canonical spelling (the np.NaN alias
# was removed in NumPy 2.0), and rebinding df instead of mutating in place
# keeps this cell safe to re-run.
df = df.replace('Not assigned', np.nan)
print("lets see how this looks!")

lets see how this looks!


In [8]:
# Remove, directly on df, every row that has a missing value in any column,
# then display what is left.
df.dropna(axis=0, how='any', inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
# Dropping rows left gaps in the index labels; renumber them from 0 and
# discard the old labels rather than keeping them as a column.
df.reset_index(inplace=True, drop=True)

In [10]:
# Print the (rows, columns) of df at this point, then display the frame.
print(df.shape)
df

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
# NOTE: every line in this cell is commented out, so nothing here runs.
# It sketches a retry loop around geocoder.google that keeps calling until
# coordinates come back.
# NOTE(review): the first geocoder call formats the literal string
# 'Postal Code' into the query instead of a postal-code value (the nested
# commented variant below uses `postal_code`) - fix before re-enabling.
# initialize your variable to None
#lat_lng_coords = None
#
# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format('Postal Code'))
#  lat_lng_coords = g.latlng
#    
#      #g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#
##latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]
#
#latitude

In [12]:
# Location of the CSV holding a latitude/longitude pair per postal code.
file = 'http://cocl.us/Geospatial_data'
# Read it straight from the URL, taking the column names from the first row.
df_lat_lng = pd.read_csv(filepath_or_buffer=file, header=[0])
df_lat_lng

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [13]:
# Attach the coordinates to each neighbourhood row by joining the two frames
# on their shared 'Postal Code' column (inner join: only postal codes present
# in both frames are kept).
torohood = df.merge(df_lat_lng, how='inner', on='Postal Code')
torohood

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [16]:
# Look up the coordinates of Toronto with the Nominatim geocoding service.
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [19]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(
    zoom_start=10,
    location=[latitude, longitude],
)
map_toronto

In [22]:
# Add one circle marker per row of torohood to the map, with a popup reading
# "<neighbourhood>, <borough>".
for lat, lng, neighbourhood, borough in zip(torohood['Latitude'], torohood['Longitude'], torohood['Neighbourhood'], torohood['Borough']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        # The option is spelled fill_opacity; the previous `fill_capacity`
        # was not a recognised name, so the 0.7 opacity was never applied.
        fill_opacity=0.7,
    ).add_to(map_toronto)
map_toronto

In [41]:
# Keep only the rows whose borough name ends with 'Toronto', and renumber
# the result from 0.
toronto_center = (
    torohood
    .loc[lambda frame: frame['Borough'].str.endswith('Toronto')]
    .reset_index(drop=True)
)
toronto_center

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
