# Capstone Project Web Scraping
James Cage, May 2019.

For Cousera Applied Data Science Capstone Class.

This notebook converts an online table of population by Georgia county to a pandas dataframe

In [1]:
# Import necessary libraries
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# uncomment the following line to install geopy if needed
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import requests
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


In [2]:
# Get data from Georgia Governor's website

GA_demographics = "https://opb.georgia.gov/sites/opb.georgia.gov/files/related_files/site_page/County%20Population%20by%20Race%202016.xlsx"

# Read the spreadsheet, skipping the first 3 rows
df_georgia = pd.read_excel(GA_demographics, skiprows=range(2), skipfooter=2)

# Assign column names
df_georgia.columns = ["County", "unused1", "Total", "unused2", "White", "African American", "American Indian", "Asian", "Islander", "Mixed"]

# Drop unused columns
df_georgia.drop(['unused1','unused2'], axis=1, inplace=True)

# Only include county name in 'County' column
df_georgia['County'] = df_georgia['County'].str.split(" County, Georgia", n = 1, expand = True)

df_georgia.head()

Unnamed: 0,County,Total,White,African American,American Indian,Asian,Islander,Mixed
0,Appling,18428,14318,3577,104,169,35,225
1,Atkinson,8273,6370,1450,113,78,91,171
2,Bacon,11372,9173,1886,39,70,22,182
3,Baker,3150,1619,1445,12,36,2,36
4,Baldwin,45144,24467,19142,155,800,31,549


In [0]:
# Drop columns not needed for this analysis

df_georgia = df_georgia[['County', 'Total', 'Asian']]

df_georgia['Asian Percent'] = 100 * df_georgia['Asian'] / df_georgia['Total']

In [36]:
!wget -q -O 'georgia.json' https://github.com/JamesDCage/Final-Week-0/raw/master/map.json
print('Data downloaded!')
with open('georgia.json') as json_data:
    georgia_data = json.load(json_data)

Data downloaded!


In [45]:
df_georgia.sort_values(by='Asian', ascending=False).head(10)

Unnamed: 0,County,Total,Asian,Asian Percent,Log_Asian
66,Gwinnett,907135,107947,11.899772,107947
59,Fulton,1023336,71301,6.967506,71301
43,DeKalb,740321,47412,6.404249,47412
32,Cobb,748150,39862,5.328076,39862
57,Forsyth,221009,26484,11.983222,26484
30,Clayton,279462,14786,5.29088,14786
24,Chatham,289082,8334,2.882919,8334
74,Henry,221768,7459,3.363425,7459
35,Columbia,147450,6371,4.320787,6371
28,Clarke,124707,5513,4.420762,5513


In [47]:
# Add log information 

df_georgia['Log_Asian'] = df_georgia['Asian'].log()

AttributeError: ignored

In [46]:
df_georgia.sort_values(by='Asian', ascending=False).head(10)


Unnamed: 0,County,Total,Asian,Asian Percent,Log_Asian
66,Gwinnett,907135,107947,11.899772,107947
59,Fulton,1023336,71301,6.967506,71301
43,DeKalb,740321,47412,6.404249,47412
32,Cobb,748150,39862,5.328076,39862
57,Forsyth,221009,26484,11.983222,26484
30,Clayton,279462,14786,5.29088,14786
24,Chatham,289082,8334,2.882919,8334
74,Henry,221768,7459,3.363425,7459
35,Columbia,147450,6371,4.320787,6371
28,Clarke,124707,5513,4.420762,5513


# Choropleth

# New Section

In [42]:
# This section uses Folium's choropleth class, instead of the depreciated
# choropleth method.

georgia_geo = r'georgia.json'

map_center = [32.837610, -83.483272]

map_georgia = folium.Map(location=map_center, zoom_start=7)

folium.Choropleth(
    geo_data=georgia_geo,
    data=df_georgia,
    columns=['County', 'Asian'],
    key_on='feature.properties.NAME',
    # threshold_scale=threshold_scale,
    fill_color='YlGn', 
    fill_opacity=0.9, 
    line_opacity=0.8,
    legend_name='Asian Population by County in Georgia',
    # reset=True,
    bins=[0, 4000, 8000, 16000, 32000, 64000, 110000],
    highlight=True
).add_to(map_georgia)

map_georgia

# Foursquare Data Exploration

In [0]:
# Scrape web pages

# List of sources

GA_12 = ("https://us-places.com/Georgia/population-by-County.htm",
         "https://us-places.com/Georgia/asian-total-population-comparison.htm",
         "https://us-places.com/Georgia/asian-percentage-population-comparison.htm")
GA_19 = "http://worldpopulationreview.com/us-counties/ga/"

# Get HTML
res = requests.get(GA_12)

soup = BeautifulSoup(res.content)      # Can also use faster parser: BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[1]      # Find the table on the page (usually 0 or 1)
df_post = pd.read_html(str(table))[0]  # Convert the text representation of the table to a pandas dataframe

print('Shape: ' + str(df_post.shape))   # This is not the final shape - reference only
df_post.head(5)

Shape: (160, 2)


Unnamed: 0,0,1
0,County,Total Population
1,Fulton,977129
2,Gwinnett,840575
3,DeKalb,707401
4,Cobb,707277


In [0]:
# Create a set of all unique Postcode / Borough combinations
# Use a set to eliminate any duplicates

area_set = set()
for i, row in df_post.iterrows():
    
    # While we are here, fix the case where the borough is assigned but the neighborhood is not
    if row['Borough'] != 'Not assigned' and row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']
        
    # Now add the PostalCode / Borough combination to the set
    area_set.add((row['PostalCode'], row['Borough']))

print(len(area_set))   # The number of unique pairs of post codes and assigned boroughs 
df_post[(df_post['PostalCode'] == 'M7A') & (df_post['Borough'] == "Queen's Park")]   # Check to be sure "Not Assigned" neighborhood is fixed

103


Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


In [0]:
# Create a blank dataframe to hold the results
df_toronto = pd.DataFrame(columns=df_post.columns)

# Iterate through the set to fill in new dataframe
for i in area_set:
    bacon_zip_code, village = i[0], i[1]
    hoods = ', '.join(list(df_post[(df_post['PostalCode'] == bacon_zip_code) & (df_post['Borough'] == village)]['Neighborhood']))
    row_dict = {'PostalCode': [bacon_zip_code],
                'Borough': [village],
                'Neighborhood': [hoods]}
    a_row = pd.DataFrame.from_dict(row_dict)
    df_toronto = df_toronto.append(a_row)    
    
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
0,M6G,Downtown Toronto,Christie
0,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre"
0,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn"
0,M5B,Downtown Toronto,"Ryerson, Garden District"
0,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
0,M3B,North York,Don Mills North
0,M9A,Etobicoke,Islington Avenue
0,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
0,M4Y,Downtown Toronto,Church and Wellesley


In [0]:
# Save as CSV for use in next notebook
with open ('df_toronto.csv', 'w') as f:
    f.write(df_toronto.sort_values(by=['PostalCode']).to_csv(index=False))

In [0]:
# Confirm the size of the resulting dataframe
df_toronto.shape

(103, 3)