# Coursera IBM Capstone Projects: Locations in Toronto's Neighborhoods
## Capstone Assignment week 3 (scraping web data, applying location api, clustering, visualization)

### Stage 1: gather postal codes from Wikipedia 

In [1]:
# Import the libraries used for downloading (requests), HTML parsing
# (BeautifulSoup) and tabular processing (pandas / numpy), then configure
# how pandas renders data frames in this notebook.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# allow up to 200 characters per cell before pandas truncates the text
pd.set_option('display.max_colwidth', 200)
# None removes the row limit, so displayed frames are shown in full
pd.set_option('display.max_rows', None)
import numpy as np

The postal code data frame needs the columns: 'PostalCode', 'Borough', 'Neighborhood'.
Postal codes beginning with 'M' belong to the city of Toronto, see: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
# Download the Wikipedia page that holds the postal code table.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps a stalled connection from blocking the kernel forever
# (requests waits indefinitely when no timeout is given).
response = requests.get(wiki_url, timeout=30)

# Stop here with a clear HTTP error instead of handing an error page
# to the parser in the next cell.
response.raise_for_status()

# `file` holds the raw HTML text that the parsing cell reads.
file = response.text

In [None]:
# Use BeautifulSoup to extract the postal code table from the downloaded HTML.
soup = BeautifulSoup(file, 'html5lib')

# The postal codes are stored in the first <tbody></tbody> of the page.
wiki_table = soup.find('tbody')
if wiki_table is None:
    raise ValueError('No <tbody> element found; the page layout may have changed.')

# Read the table row by row instead of slicing one flat list of cells with a
# stride of three: a single row with a missing or extra cell would otherwise
# shift every following value into the wrong column.
code_rows = []
for tr in wiki_table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    # Header rows contain no <td>; only keep complete three-cell data rows.
    if len(cells) == 3:
        code_rows.append(cells)

if not code_rows:
    raise ValueError('No three-column rows found; the page layout may have changed.')

# One record per table row: postal code, borough, neighborhood.
postal_df = pd.DataFrame(code_rows, columns=['PostalCode', 'Borough', 'Neighborhood'])
postal_df.shape

In [None]:
# Clean the data frame in three steps:
#   1. ignore rows whose Borough is 'Not assigned';
#   2. where the Neighborhood is 'Not assigned', use the Borough name instead;
#   3. combine rows that share a PostalCode and join their neighborhoods.
#
# Step 2 runs per row *before* the rows are joined. Doing it afterwards only
# matches when the whole joined string equals 'Not assigned', so a postal
# code with several rows could keep the placeholder in its result.
postal_df = (
    postal_df
    .loc[lambda df: df['Borough'] != 'Not assigned']
    .assign(
        # keep assigned neighborhoods, fall back to the borough otherwise
        Neighborhood=lambda df: df['Neighborhood'].where(
            df['Neighborhood'] != 'Not assigned', df['Borough']
        )
    )
    .groupby(['PostalCode', 'Borough'], sort=True, as_index=False)
    # drop missing and repeated names so each neighborhood is listed once
    .agg({'Neighborhood': lambda names: ', '.join(names.dropna().drop_duplicates())})
)
postal_df

In [None]:
# show the (rows, columns) shape of postal_df as the cell result
postal_df.shape