#### First install Beautiful Soup package, used for performing "web-scraping" operations...

#### (Note that we install the latest Beautiful Soup package, version 4, using LXML parser) 

In [1]:
# Shell escape: install the beautifulsoup4 package from the conda-forge channel.
# --yes answers conda's confirmation prompt automatically so the cell does not block.
!conda install -c conda-forge  beautifulsoup4  --yes  

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
beautifulsoup4            4.6.3                    py35_0    conda-forge


In [2]:
# Shell escape: install the lxml package from the conda-forge channel.
# 'lxml' is the parser name passed to BeautifulSoup later in this notebook.
# --yes answers conda's confirmation prompt automatically so the cell does not block.
!conda install -c conda-forge  lxml  --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    libxml2: 2.9.4-h6b072ca_5     --> 2.9.8-h422b904_2     conda-forge
    libxslt: 1.1.29-hcf9102b_5    --> 1.1.32-h88dbc4e_2    conda-forge
    lxml:    4.1.0-py35ha401a81_0 --> 4.2.5-py35hc9114bc_0 conda-forge

libxml2-2.9.8- 100% |################################| Time: 0:00:00   2.85 MB/s
libxslt-1.1.32 100% |################################| Time: 0:00:00   8.44 MB/s
lxml-4.2.5-py3 100% |################################| Time: 0:00:00  12.97 MB/s


#### Now import the necessary Python Libraries...

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [2]:
# Set up an empty Pandas dataframe that will hold the Toronto neighborhood data.
# It has exactly three columns: PostalCode, Borough, and Neighborhood.
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# no rows yet -- only the column labels are supplied
df_neighborhoods = pd.DataFrame(data=None, columns=column_names)

# display the (empty) frame so the column names can be checked visually
df_neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


# 1. Perform Web-Scraping and Explore Resulting Dataset

#### Now use the Python requests library to read the contents of the Wikipedia web site as a string of HTML code

#### This HTML code string will then be parsed using the Beautiful Soup library (with the lxml parser)

In [3]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops with an HTTPError on a 4xx/5xx answer instead of
# silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# `source` is the raw HTML text; `soup` is the parsed document tree (lxml parser)
source = response.text
soup = BeautifulSoup(source, 'lxml')


#### We now use the structure of the HTML code to find the postal code, borough and neighborhood data.

#### Note that it is necessary to examine HTML code to see how it should be parsed to find this data...

In [4]:
# Locate the table holding the PostalCode, Borough and Neighborhood data in the parsed HTML.
# find() returns None when nothing matches, so check explicitly and fail with a
# readable message rather than an AttributeError on None.
body = soup.find('body')
table = body.find('table', class_='wikitable sortable') if body is not None else None
if table is None:
    raise ValueError("Could not find a table with class 'wikitable sortable' in the downloaded page")

# Rows are normally children of <tbody>; fall back to the table itself if the
# markup has no <tbody> element.
rows_parent = table.tbody if table.tbody is not None else table
table_data = rows_parent.find_all('tr')

# Dry run over the data rows (the first row is the header and is skipped).
# Each row's text is split on newlines and fields 1..3 are read as postal code,
# borough and neighborhood. Nothing is stored here; an IndexError at this point
# means a row does not have the expected layout.
for row in table_data[1:]:
    data = row.text.split('\n')
    postcode = data[1]
    borough = data[2]
    neighborhood = data[3]

#### Now that we have parsed this data from the web-site HTML code, it will be used to populate the Pandas dataframe.

#### Note that if a borough is not assigned, then the row is skipped; if a neighborhood is not assigned, then it is given the borough name.

In [5]:
# Walk the table rows and collect, per postal code:
#   neighborhood_dict: postal code -> list of distinct neighborhood names (in order seen)
#   borough_dict:      postal code -> borough name (last non-empty value seen)
neighborhood_dict = {}
borough_dict = {}
for row in table_data[1:]:  # the first row is the header
    fields = row.text.split('\n')
    postcode = str(fields[1])
    borough = str(fields[2])
    neighborhood = str(fields[3])

    # rows without a borough carry no usable data
    if borough == 'Not assigned':
        continue
    # an unassigned neighborhood takes the name of its borough
    if neighborhood == 'Not assigned':
        neighborhood = borough

    names = neighborhood_dict.setdefault(postcode, [])
    if neighborhood not in names:
        names.append(neighborhood)

    borough_dict.setdefault(postcode, "")
    if borough:
        borough_dict[postcode] = borough

#### Now add this information to the pandas dataframe, converting each list of neighborhood names into a comma-separated string

In [6]:
# Turn the two dictionaries into one record per postal code and load them into the dataframe
key_list = list(neighborhood_dict.keys())
data_list = []
for postal_code in key_list:
    record = {'PostalCode': postal_code, 'Borough': borough_dict[postal_code]}
    names = neighborhood_dict[postal_code]
    # several names are merged into a single comma-separated string; a lone name is used as-is
    record['Neighborhood'] = names[0] if len(names) <= 1 else ", ".join(names)
    data_list.append(record)

# build the frame in one go and fix the column order explicitly
df_neighborhoods = pd.DataFrame(data_list)[['PostalCode', 'Borough', 'Neighborhood']]
df_neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4P,Central Toronto,Davisville North
1,M3A,North York,Parkwoods
2,M5C,Downtown Toronto,St. James Town
3,M6H,West Toronto,"Dovercourt Village, Dufferin"
4,M6C,York,Humewood-Cedarvale
5,M9N,York,Weston
6,M2H,North York,Hillcrest Village
7,M2K,North York,Bayview Village
8,M4J,East York,East Toronto
9,M4A,North York,Victoria Village


#### Finally, display the number of rows in our pandas dataframe...

In [7]:
# Report how many rows the dataframe holds
row_count = len(df_neighborhoods.index)
print("The number of rows in our pandas dataframe is:", row_count)

The number of rows in our pandas dataframe is: 103


#### Now let's add Latitude and Longitude columns into the pandas dataframe...

In [8]:
# Add empty Latitude and Longitude columns at positions 3 and 4 (i.e. after the
# existing columns). DataFrame.insert raises ValueError when the column already
# exists, so columns that are already present are skipped -- this keeps the cell
# safe to re-run. The position is clamped to the current column count so the
# insert stays valid if the frame has fewer columns than expected.
for position, column in enumerate(['Latitude', 'Longitude'], start=3):
    if column not in df_neighborhoods.columns:
        df_neighborhoods.insert(min(position, len(df_neighborhoods.columns)), column, '')
df_neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4P,Central Toronto,Davisville North,,
1,M3A,North York,Parkwoods,,
2,M5C,Downtown Toronto,St. James Town,,
3,M6H,West Toronto,"Dovercourt Village, Dufferin",,
4,M6C,York,Humewood-Cedarvale,,


#### Note that we are forced to use a CSV file containing Latitude and Longitude values for the Toronto area

#### Although much time was spent working with Geocoder, it could not provide this data for us... 

In [9]:
# Download the CSV file with latitude and longitude values and save it as
# 'Toronto_Lat_Long.csv' in the working directory.
# The download is done with requests rather than a quiet shell command, so a
# failed transfer raises (HTTPError / connection error) and the success message
# below is printed only when the file was actually written.
geo_response = requests.get('https://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
with open('Toronto_Lat_Long.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print("Geospatial Data Successfully downloaded...")


Geospatial Data Successfully downloaded...


#### Now read the Latitude/Longitude data from the CSV file into our Pandas dataframe...

In [10]:
import csv

# Index the dataframe by postal code so coordinates can be assigned by label.
# Guarded so that re-running the cell (when PostalCode is already the index)
# does not fail with a KeyError.
if df_neighborhoods.index.name != 'PostalCode':
    df_neighborhoods.set_index('PostalCode', inplace=True)

# Each CSV row is read positionally: column 0 = postal code, 1 = latitude, 2 = longitude.
# newline='' is the file mode recommended for the csv module; the `with` block
# closes the file, so no explicit close() is needed.
with open('Toronto_Lat_Long.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the first row, since it is just header information
    for row in csv_reader:
        if not row:
            continue  # ignore blank lines
        postalCode = str(row[0])
        # Skip postal codes that are not in the dataframe: assigning to an unknown
        # label would either raise or silently append a new row.
        if postalCode not in df_neighborhoods.index:
            continue
        df_neighborhoods.loc[postalCode, 'Latitude'] = float(row[1])
        df_neighborhoods.loc[postalCode, 'Longitude'] = float(row[2])
            
        

#### Look at the first few rows of our dataframe, to check that the Latitude/Longitude data is available...

In [13]:
# Preview the first ten rows of the dataframe
df_neighborhoods.head(n=10)

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279,-79.262
M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.668,-79.3677
M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",43.6435,-79.5772
M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.6864,-79.4
M3J,North York,"Northwood Park, York University",43.768,-79.4873
M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.6288,-79.521
M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",43.6056,-79.5013
M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.6464,-79.3748
M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733
M6P,West Toronto,"High Park, The Junction South",43.6616,-79.4648


#### Display latitude and longitude values for selected postal codes

In [29]:
# Show the rows (including latitude and longitude) for a hand-picked set of postal codes
selected_postal_codes = [
    'M5G', 'M2H', 'M4B', 'M1J',
    'M4G', 'M4M', 'M1R', 'M9V',
    'M9L', 'M5V', 'M1B', 'M5A',
]
df_neighborhoods.loc[selected_postal_codes]


Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
M2H,North York,Hillcrest Village,43.8038,-79.3635
M4B,East York,"Woodbine Gardens, Parkview Hill",43.7064,-79.3099
M1J,Scarborough,Scarborough Village,43.7447,-79.2395
M4G,East York,Leaside,43.7091,-79.3635
M4M,East Toronto,Studio District,43.6595,-79.3409
M1R,Scarborough,"Maryvale, Wexford",43.7501,-79.2958
M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.7394,-79.5884
M9L,North York,Humber Summit,43.7563,-79.566
M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.6289,-79.3944


In [14]:
# Summarize the dataframe: number of distinct boroughs and number of entries in
# the Neighborhood column.
# NOTE(review): the second figure is the row count of the column, not a count of
# individual neighborhood names -- confirm this is the intended meaning.
borough_count = len(df_neighborhoods['Borough'].unique())
neighborhood_rows = df_neighborhoods['Neighborhood'].shape[0]
print('The dataframe has {} Boroughs and {} Neighborhoods'.format(borough_count, neighborhood_rows))


The dataframe has 11 Boroughs and 103 Neighborhoods
