# IBM Data Science
## Coursera Capstone notebook Week 3
This notebook will be used to analyse location data in Toronto for the capstone project of the IBM Data Science course.

**All 3 parts are in this notebook - please scroll to the appropriate part**

### Part 1: Setting up the notebook

In [198]:
"""install the necessary packages"""
# pandas is the main data-handling library; the set_option calls only change how frames are displayed.
import pandas as pd 
pd.set_option('display.max_columns', None)  # None = no limit on the number of columns shown
pd.set_option('display.max_rows', None)  # None = no limit on rows shown, so prefer .head() on large frames
import json 
import numpy as np
# One-off environment setup; left commented out because the packages are already installed.
#!pip install beautifulsoup4 ## These are commented out as the packages are now installed
#!pip install lxml
#!conda install -c conda-forge geopy --yes
#!pip install requests
from bs4 import BeautifulSoup as bs  # HTML parser used for the Wikipedia scrape
import requests  # HTTP client used to download the page source
pd.set_option("display.precision", 3)  # show floats to 3 decimal places in displayed output

In [199]:
"""use beautiful soup to import the data"""
# Download the Wikipedia page listing the "M" (Toronto) postal codes.
# The timeout stops the cell hanging indefinitely on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of parsing an error page as data.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source_html = response.text  # page source as text

soup = bs(source_html, 'lxml')  # parse the source with the lxml parser
# print(soup.prettify())  # indented HTML dump; used once to work out how to locate the table

wikitable = soup.tbody  # first <tbody> element found in the parsed page
if wikitable is None:
    # Fail with a clear message rather than an AttributeError further down the notebook.
    raise ValueError('No <tbody> element found in the downloaded page; the page layout may have changed.')

In [200]:
"""Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 
    It returns a list of rows with inner columns. 
    Accepts only one <th> (table header/data) in the first row.
    """
def tableDataText(table):
    """Extract the cell text of an HTML table as a list of rows.

    Parameters
    ----------
    table : object exposing ``find_all``
        A parsed element (e.g. a BeautifulSoup tag) whose ``find_all('tr')``
        yields the table rows.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>``. If the first row contains ``<th>`` cells,
        their text becomes the first inner list (the header); every other row
        contributes the text of its ``<td>`` cells. Cell text is stripped of
        surrounding whitespace. An element with no ``<tr>`` rows gives ``[]``.
    """
    def rowgetDataText(tr, coltag='td'):
        """Return the stripped text of every ``coltag`` cell (td or th) in one row."""
        return [cell.get_text(strip=True) for cell in tr.find_all(coltag)]

    trs = table.find_all('tr')
    if not trs:
        # Nothing to index into: an empty table has neither header nor data rows.
        return []

    rows = []
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:
        # Only the first row is treated as a header; it is consumed here so it
        # is not re-read as an (empty) data row below.
        rows.append(headerow)
        trs = trs[1:]
    rows.extend(rowgetDataText(tr, 'td') for tr in trs)
    return rows

In [201]:
wikiclean = tableDataText(wikitable)  # list of rows: header first, then the data rows
if not wikiclean:
    raise ValueError('No rows were scraped from the postcode table.')

# First scraped row supplies the column names, the rest are the records.
nbhd_raw = pd.DataFrame(wikiclean[1:], columns=wikiclean[0])

# Drop postcodes that have no borough. The .copy() makes this an independent frame,
# so the column assignment below does not write into a slice of nbhd_raw.
nbhd = nbhd_raw[nbhd_raw.Borough != 'Not assigned'].copy()

# Where the neighbourhood is 'Not assigned', fall back to the borough name of the same row.
# mask() does this element-wise; Series.replace() with a Series as the replacement value does not.
nbhd['Neighbourhood'] = nbhd['Neighbourhood'].mask(nbhd['Neighbourhood'] == 'Not assigned', nbhd['Borough'])

# One comma-separated string of neighbourhood names per postcode, indexed by Postcode.
# Naming the Series 'Neighbourhoods' gives the joined column its title.
nbhd2 = nbhd.groupby('Postcode')['Neighbourhood'].apply(", ".join).rename('Neighbourhoods')

nbhd = (
    nbhd
    .drop(columns=['Neighbourhood'])              # the per-row names are superseded by the joined string
    .drop_duplicates()                            # leaves one row per distinct postcode/borough pair
    .join(nbhd2, on='Postcode', how='inner')      # attach the joined names by postcode
    .sort_values(by=['Postcode'])                 # alphabetical by postcode
    .reset_index(drop=True)                       # clean 0..n-1 index
)
nbhd.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [202]:
# Show the (rows, columns) dimensions of the cleaned postcode frame.
nbhd.shape

(103, 3)

### Part 2: Obtaining latitude and longitude

In [203]:
# !pip install geocoder # install the necessary package
import geocoder # import geocoder

In [204]:
# Empty frame with one column per coordinate; it is filled in by the geocoding step.
coordinate_columns = ['Lat', 'Long']
latlong = pd.DataFrame(columns=coordinate_columns)
latlong

Unnamed: 0,Lat,Long


In [206]:
postal_code = nbhd.Postcode  # postcodes to geocode, in the same order as the rows of nbhd

# Collect one (lat, long) pair per postcode in a plain list and build the frame once.
# Growing a DataFrame row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
coords = []
for code in postal_code:
    # using arcgis as google rejected the requests
    # Please note some lat/longs might be different to the rubric
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    found = g.latlng
    # A lookup with no result leaves latlng empty/None; record NaN instead of crashing on indexing.
    lat, lng = found if found else (float('nan'), float('nan'))
    coords.append((lat, lng))

# Rebuilt from scratch on every run and indexed like nbhd, so rows line up on concat.
latlong = pd.DataFrame(coords, columns=['Lat', 'Long'], index=nbhd.index)

# Drop any Lat/Long left over from a previous run of this cell so that re-running it
# does not produce duplicate coordinate columns; axis must be passed by keyword.
nbhd = pd.concat([nbhd.drop(columns=['Lat', 'Long'], errors='ignore'), latlong], axis=1)

In [210]:
# Preview the first rows of nbhd to check that the Lat and Long columns were added.
nbhd.head()

Unnamed: 0,Postcode,Borough,Neighbourhoods,Lat,Long
0,M1B,Scarborough,"Rouge, Malvern",43.812,-79.196
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.786,-79.159
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.766,-79.175
3,M1G,Scarborough,Woburn,43.768,-79.218
4,M1H,Scarborough,Cedarbrae,43.77,-79.239
