## Explore Toronto

### Part 1

In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Scrape the Wikipedia page and locate the first table on it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly rather than parse an HTTP error page
soup = BeautifulSoup(response.text, 'html.parser')
html_table = soup.find('table')
if html_table is None:
    raise ValueError(f'No <table> element found at {url}')

# Collect the text of every <td> cell, one list per table row.
# Rows without any <td> (e.g. a header row) are skipped so they never become
# an all-None row that has to be dropped by position afterwards.
# NOTE(review): assumes each data row has exactly three cells
# (postal code, borough, neighborhood) -- confirm against the live page layout.
records = []
for table_row in html_table.find_all('tr'):
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    if cells:
        records.append(cells)
toronto_raw = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighborhood"])

# Keep only postal codes with an assigned borough. The explicit copy makes the
# column assignment below operate on an independent frame.
toronto_assigned = toronto_raw[toronto_raw['Borough'] != 'Not assigned'].copy()

# A neighborhood that is "Not assigned" takes the name of its borough. This is
# done before grouping so the placeholder can never end up inside a joined string.
toronto_assigned['Neighborhood'] = np.where(
    toronto_assigned['Neighborhood'] == 'Not assigned',
    toronto_assigned['Borough'],
    toronto_assigned['Neighborhood'],
)

# One row per (postal code, borough); neighborhoods joined with ", ".
toronto = (
    toronto_assigned
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg({'Neighborhood': ', '.join})
)

# Have a look at the resulting dataframe
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Part 2

**Load coordinates data into a dataframe**

In [4]:
!wget -O Geospatial_data.csv http://cocl.us/Geospatial_data
coords = pd.read_csv("Geospatial_data.csv", delimiter=",")
coords.head()

--2019-05-07 07:10:35--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2019-05-07 07:10:35--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-05-07 07:10:36--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.26.197
Connecting to ibm.box.com (ibm.box.com)|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-05-07 07:10:36--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjc

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Set the postal code as the index for both dataframes**

In [5]:
# Index both frames by postal code so they can be aligned on it.
# The guards make this cell safe to re-run: once a frame is already indexed by
# its postal-code column, that column no longer exists and a second
# set_index call would raise KeyError.
if coords.index.name != 'Postal Code':
    coords = coords.set_index('Postal Code')
if toronto.index.name != 'PostalCode':
    toronto = toronto.set_index('PostalCode')

**Add latitude and longitude columns to the original dataframe and reset the index**

In [6]:
# Attach Latitude/Longitude to each postal code with a left join on the
# postal-code index, then turn the index back into a regular column.
#
# Both frames are (re)indexed here only if they are not already, and any
# Latitude/Longitude columns from a previous run are dropped first. This keeps
# the cell idempotent: re-running it after the index was reset no longer aligns
# on a plain integer index, which would silently fill the columns with NaN.
coords_by_code = coords if coords.index.name == 'Postal Code' else coords.set_index('Postal Code')
toronto_by_code = toronto if toronto.index.name == 'PostalCode' else toronto.set_index('PostalCode')

toronto = (
    toronto_by_code
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .join(coords_by_code[['Latitude', 'Longitude']], how='left')
    .reset_index()
)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
