# IBM Data Science
## Coursera Capstone notebook Week 3
This notebook will be used to analyse location data in Toronto for the capstone project of the IBM Data Science course.

**All 3 parts are in this notebook - please scroll to the appropriate part**

### Part 1

In [32]:
"""install the necessary packages"""
# NOTE: this cell only imports libraries and sets pandas display options;
# the actual install commands further down are commented out.
import pandas as pd 
# Passing None removes pandas' display limits, so frames are rendered with
# every column and every row (large frames will produce very long outputs).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 
import numpy as np
# One-off environment setup, kept commented out because the packages are now installed.
#!pip install beautifulsoup4 ## These are commented out as the packages are now installed
#!pip install lxml
#!conda install -c conda-forge geopy --yes
#!pip install requests
# BeautifulSoup (aliased to `bs`) parses the HTML that `requests` downloads.
from bs4 import BeautifulSoup as bs
import requests

In [86]:
"""use beautiful soup to import the data"""
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # without a timeout a stalled connection would hang the cell indefinitely
)
response.raise_for_status()  # fail loudly on HTTP 4xx/5xx instead of parsing an error page
source_html = response.text  # page source as text
soup = bs(source_html, 'lxml')  # parse the source with BeautifulSoup's lxml backend
# print(soup.prettify())  # pretty-prints the HTML; used to work out how to locate the table
wikitable = soup.tbody  # first <tbody> element in the document
if wikitable is None:
    # Surface a changed page layout here rather than as an AttributeError further down.
    raise ValueError('No <tbody> element found in the downloaded page')

In [107]:
"""Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 
    It returns a list of rows with inner columns. 
    Accepts only one <th> (table header/data) in the first row.
    """
def tableDataText(table):
    """Extract the cell text of an HTML table as a list of rows.

    Parameters
    ----------
    table
        A parsed HTML element (e.g. a BeautifulSoup ``<table>`` or ``<tbody>``
        tag) whose ``find_all('tr')`` yields the table rows.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>``, holding the whitespace-stripped text of
        its cells. If the first row contains ``<th>`` cells, those become the
        first inner list (the header); every other row contributes only its
        ``<td>`` cells. A table without any ``<tr>`` yields an empty list.
    """
    def rowgetDataText(tr, coltag='td'):
        # coltag selects which cell type to read: 'td' (data) or 'th' (header).
        return [cell.get_text(strip=True) for cell in tr.find_all(coltag)]

    trs = table.find_all('tr')
    if not trs:
        # Nothing to parse; indexing trs[0] below would raise IndexError.
        return []

    rows = []
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:
        # The first row is a header: emit it first and skip it in the data pass.
        rows.append(headerow)
        trs = trs[1:]
    rows.extend(rowgetDataText(tr, 'td') for tr in trs)
    return rows

In [108]:
# Flatten the scraped Wikipedia table into a list of rows (header row first).
wikiclean = tableDataText(wikitable)

# Header row -> column names, remaining rows -> data; then keep only the
# postcodes whose borough has actually been assigned.
nbhd = (
    pd.DataFrame(data=wikiclean[1:], columns=wikiclean[0])
    .loc[lambda frame: frame['Borough'].ne('Not assigned')]
)

# Frequency of each neighbourhood name; the 'Not assigned' entry shows how many
# rows still lack a neighbourhood after the borough filter.
nbhd['Neighbourhood'].value_counts(dropna=True)


St. James Town                                       2
Runnymede                                            2
Eringate                                             1
Rathnelly                                            1
Grange Park                                          1
Martin Grove                                         1
Little Portugal                                      1
Swansea                                              1
Mimico NE                                            1
St. Phillips                                         1
Woodbine Gardens                                     1
Guildwood                                            1
The Queensway East                                   1
Island airport                                       1
Port Union                                           1
Agincourt                                            1
Cliffside                                            1
Alderwood                                            1
Sunnylea  

In [109]:
# Display the rows that have a borough but still no neighbourhood name.
nbhd.loc[nbhd["Neighbourhood"].eq("Not assigned")]

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,Not assigned


In [110]:
# A row with no neighbourhood name takes the name of its own borough. Filling
# from the Borough column (instead of a hard-coded "Queen's Park" replace over
# the whole frame) stays correct for any borough and leaves other columns alone.
unassigned = nbhd['Neighbourhood'].eq('Not assigned')
nbhd = (
    nbhd
    .assign(Neighbourhood=nbhd['Neighbourhood'].mask(unassigned, nbhd['Borough']))
    .sort_values(by=['Postcode'])  # order rows by postcode
    .reset_index(drop=True)        # renumber rows 0..n-1 after filtering and sorting
)
nbhd.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Port Union
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Highland Creek


In [113]:
# Dimensions of the cleaned frame as (number of rows, number of columns).
nbhd.shape

(210, 3)