# Capstone Project Web Scraping
James Cage, May 2019.

For Cousera Applied Data Science Capstone Class.

This notebook converts an online table of population by Georgia county to a pandas dataframe

In [2]:
# Import necessary libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [13]:
# Scrape web pages

# List of sources

GA_12 = ("https://us-places.com/Georgia/population-by-County.htm",
         "https://us-places.com/Georgia/asian-total-population-comparison.htm",
         "https://us-places.com/Georgia/asian-percentage-population-comparison.htm")
GA_19 = "http://worldpopulationreview.com/us-counties/ga/"

# Get HTML
res = requests.get(GA_12)

soup = BeautifulSoup(res.content)      # Can also use faster parser: BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[1]      # Find the table on the page (usually 0 or 1)
df_post = pd.read_html(str(table))[0]  # Convert the text representation of the table to a pandas dataframe

print('Shape: ' + str(df_post.shape))   # This is not the final shape - reference only
df_post.head(5)

Shape: (160, 2)


Unnamed: 0,0,1
0,County,Total Population
1,Fulton,977129
2,Gwinnett,840575
3,DeKalb,707401
4,Cobb,707277


In [21]:
# Get data from Georgia Governor's website

GA_demographics = "https://opb.georgia.gov/sites/opb.georgia.gov/files/related_files/site_page/County%20Population%20by%20Race%202016.xlsx"
column_names = ["County", "unused1", "Total", "unused2", "White", "African American", "American Indian", "Asian", "Islander", "Mixed"]
df_georgia = pd.read_excel(GA_demographics, skiprows=2)
df_georgia.columns=column_names
df_georgia.drop(['unused1','unused2'], axis=1, inplace=True)
df_georgia.head()

Unnamed: 0,County,Total,White,African American,American Indian,Asian,Islander,Mixed
0,"Appling County, Georgia",18428.0,14318.0,3577.0,104.0,169.0,35.0,225.0
1,"Atkinson County, Georgia",8273.0,6370.0,1450.0,113.0,78.0,91.0,171.0
2,"Bacon County, Georgia",11372.0,9173.0,1886.0,39.0,70.0,22.0,182.0
3,"Baker County, Georgia",3150.0,1619.0,1445.0,12.0,36.0,2.0,36.0
4,"Baldwin County, Georgia",45144.0,24467.0,19142.0,155.0,800.0,31.0,549.0


In [50]:
# Assign column names to match lab
df_post.columns = ['PostalCode','Borough', 'Neighborhood'] 

# Get rid of rows that do not contain a borough
df_post = df_post[df_post['Borough'] != 'Not assigned']

print('Shape: ' + str(df_post.shape))   # This is not the final shape - reference only
df_post.head(2)

Shape: (211, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village


In [56]:
# Create a set of all unique Postcode / Borough combinations
# Use a set to eliminate any duplicates

area_set = set()
for i, row in df_post.iterrows():
    
    # While we are here, fix the case where the borough is assigned but the neighborhood is not
    if row['Borough'] != 'Not assigned' and row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']
        
    # Now add the PostalCode / Borough combination to the set
    area_set.add((row['PostalCode'], row['Borough']))

print(len(area_set))   # The number of unique pairs of post codes and assigned boroughs 
df_post[(df_post['PostalCode'] == 'M7A') & (df_post['Borough'] == "Queen's Park")]   # Check to be sure "Not Assigned" neighborhood is fixed

103


Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


In [52]:
# Create a blank dataframe to hold the results
df_toronto = pd.DataFrame(columns=df_post.columns)

# Iterate through the set to fill in new dataframe
for i in area_set:
    bacon_zip_code, village = i[0], i[1]
    hoods = ', '.join(list(df_post[(df_post['PostalCode'] == bacon_zip_code) & (df_post['Borough'] == village)]['Neighborhood']))
    row_dict = {'PostalCode': [bacon_zip_code],
                'Borough': [village],
                'Neighborhood': [hoods]}
    a_row = pd.DataFrame.from_dict(row_dict)
    df_toronto = df_toronto.append(a_row)    
    
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
0,M6G,Downtown Toronto,Christie
0,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre"
0,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn"
0,M5B,Downtown Toronto,"Ryerson, Garden District"
0,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
0,M3B,North York,Don Mills North
0,M9A,Etobicoke,Islington Avenue
0,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
0,M4Y,Downtown Toronto,Church and Wellesley


In [60]:
# Save as CSV for use in next notebook
with open ('df_toronto.csv', 'w') as f:
    f.write(df_toronto.sort_values(by=['PostalCode']).to_csv(index=False))

In [53]:
# Confirm the size of the resulting dataframe
df_toronto.shape

(103, 3)