<a href="https://colab.research.google.com/github/MaguireMaName/Coursera_Capstone/blob/master/Machine_Learning_w_Python_Geocoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
!pip install geocoder



In [0]:
# bring in dependencies 
import geocoder
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as rq

## Machine Learning with Python: Dataframe of postal code, neighborhood, & borough
*For the Applied Data Science Capstone Project*

In [73]:
# define url for scraping and print

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
print(url)

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [0]:
response = rq.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

#print(soup)

In [0]:
table = soup.find('table', {'class':'wikitable sortable'}).tbody
#print(table)

rows = table.find_all('tr')

columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]

df_a = pd.DataFrame(columns=columns)

for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) ==4:
      values = [tds[0].text, tds[1].text,'', tds[2].text, tds[3].text.replace('\n','').replace('\xa0','')]
    else:
      values = [td.text.replace('\n','').replace('\xa0','') for td in tds]
    
    df_a = df_a.append(pd.Series(values, index=columns), ignore_index=True)


In [76]:
# dimensions before aggregation

df_a.shape

(288, 3)

In [0]:
# aggregate data

df_b = df_a.groupby(['Postcode','Borough']).agg(lambda x: x.tolist()).reset_index()

In [78]:
df_b.shape

(180, 3)

In [79]:
# check results

df_b.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,[Not assigned]
1,M1B,Scarborough,"[Rouge, Malvern]"
2,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[Woburn]


In [0]:
# where neighbourhood is not assigned, replace it with borough

df_b['Neighbourhood'] = np.where(df_b['Neighbourhood'] == "Not assigned", df_b['Borough'], df_b['Neighbourhood'])


In [81]:
# exception table

x_neighbourhood = df_b.loc[(df_b['Neighbourhood'] == "Not assigned")]
x_neighbourhood.shape

(0, 3)

In [82]:
# exception table

x_borough = df_b.loc[(df_b['Borough'] == "Not assigned")]
x_borough.shape

(77, 3)

In [0]:
# don't process obs. where borough = 'Not assigned'

df_c = df_b.drop(df_b[df_b.Borough == "Not assigned"].index)


In [84]:
# exception table

x_borough = df_c.loc[(df_b['Borough'] == "Not assigned")]
x_borough.shape

(0, 3)

In [85]:
# dimensions after aggregation
df_c.shape

(103, 3)

In [0]:
postcode = "Postcode"
borough = "Borough"
neighbourhood = "Neighbourhood"

In [0]:
start_index = 0
status_rate = 100
write_data_rate = 1000
attempts_to_geocode = 3
wait_time = 3

In [0]:
if neighbourhood not in df_c.columns:
  raise ValueError("Can't find neighbourhood column in dataframe")

if borough not in df_c.columns:
  raise ValueError("Can't find borough column in dataframe") 

In [0]:
if(postcode):
  if postcode not in df_c.columns:
    raise ValueError("Can't find postcode column in dataframe")    
  addresses = (df_c[neighbourhood].astype(str) + ", " + df_c[borough].astype(str) + ", " + df_c[postcode].astype(str)).tolist()
else:
  addresses = (df_c[neighbourhood].astype(str) + ", " + df_c[borough].astype(str)).tolist()

In [99]:
addresses

["['Rouge', 'Malvern'], Scarborough, M1B",
 "['Highland Creek', 'Rouge Hill', 'Port Union'], Scarborough, M1C",
 "['Guildwood', 'Morningside', 'West Hill'], Scarborough, M1E",
 "['Woburn'], Scarborough, M1G",
 "['Cedarbrae'], Scarborough, M1H",
 "['Scarborough Village'], Scarborough, M1J",
 "['East Birchmount Park', 'Ionview', 'Kennedy Park'], Scarborough, M1K",
 "['Clairlea', 'Golden Mile', 'Oakridge'], Scarborough, M1L",
 "['Cliffcrest', 'Cliffside', 'Scarborough Village West'], Scarborough, M1M",
 "['Birch Cliff', 'Cliffside West'], Scarborough, M1N",
 "['Dorset Park', 'Scarborough Town Centre', 'Wexford Heights'], Scarborough, M1P",
 "['Maryvale', 'Wexford'], Scarborough, M1R",
 "['Agincourt'], Scarborough, M1S",
 '[\'Clarks Corners\', \'Sullivan\', "Tam O\'Shanter"], Scarborough, M1T',
 '[\'Agincourt North\', "L\'Amoreaux East", \'Milliken\', \'Steeles East\'], Scarborough, M1V',
 '["L\'Amoreaux West"], Scarborough, M1W',
 "['Upper Rouge'], Scarborough, M1X",
 "['Hillcrest Village