### Capstone - Webscraping
https://www.census.gov/quickfacts/fact/table/US#
"Source: Vintage 2021 Population Estimates Program.  Estimates are not comparable to other geographic levels due to methodology differences that may exist between different data sources."

In [1]:
from bs4 import BeautifulSoup as BS
from IPython.core.display import HTML
import pandas as pd
import requests
import re
import pprint

In [2]:
URL = 'https://www.census.gov/quickfacts/fact/table/US/PST045221'

response = requests.get(URL)

In [3]:
response.status_code

200

In [4]:
response.text

'<!DOCTYPE html>\n<html lang="en" prefix="fb: http://www.facebook.com/2008/fbml og: http://opengraphprotocol.org/schema/">\n<head>\n<!-- build 2505 --><!--[if lt IE 9]><script language="javascript" type="text/javascript" src="//html5shim.googlecode.com/svn/trunk/html5.js"></script><![endif]--><meta charset="utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<meta name="apple-mobile-web-app-capable" content="yes">\n<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">\n<meta name="mobile-web-app-capable" content="yes">\n<meta name="keywords" content="United States,Age and Sex,Race and Hispanic Origin,Population Characteristics,Housing,Families &amp; Living Arrangements,Computer and Internet Use,Education,Health,Economy,Transportation,Income &amp; Poverty,Businesses,Geography">\n<meta name="title" content="U.S. Census Bureau QuickFacts: United States">\n<meta name="description" content="Frequently requested statistics for: United Stat

In [5]:
soup = BS(response.text, 'html.parser')

In [6]:
perc_race = soup.find('tbody', attrs = {'data-topic': 'Race and Hispanic Origin', })('tr', attrs = {'class': 'fact', })
perc_race

[<tr class="fact" data-mnemonic="RHI125221" data-precision="1" data-unit="PCT" data-url="/quickfacts/fact/table/US/RHI125221#RHI125221">
 <td>
 <a class="quickinfo icon-info-circled-1" data-title="White alone, percent" href="/quickfacts/note/RHI125221" id="RHI125221" tabindex="-1" title="Quick Info"></a><span>White alone, percent</span>
 </td>
 <td data-geoid="00" data-isnumeric="1" data-srcnote="true" data-value="75.8">
 <span data-title="United States"></span><div class="qf-sourcenote">
 <span></span><a rel="nofollow" title="Source: Vintage 2021 Population Estimates Program.  Estimates are not comparable to other geographic levels due to methodology differences that may exist between different data sources."></a>
 </div>75.8%</td>
 </tr>,
 <tr class="fact" data-mnemonic="RHI225221" data-precision="1" data-unit="PCT" data-url="/quickfacts/fact/table/US/RHI225221#RHI225221">
 <td>
 <a class="quickinfo icon-info-circled-1" data-title="Black or African American alone, percent" href="/q

In [7]:
perc_race[0]

<tr class="fact" data-mnemonic="RHI125221" data-precision="1" data-unit="PCT" data-url="/quickfacts/fact/table/US/RHI125221#RHI125221">
<td>
<a class="quickinfo icon-info-circled-1" data-title="White alone, percent" href="/quickfacts/note/RHI125221" id="RHI125221" tabindex="-1" title="Quick Info"></a><span>White alone, percent</span>
</td>
<td data-geoid="00" data-isnumeric="1" data-srcnote="true" data-value="75.8">
<span data-title="United States"></span><div class="qf-sourcenote">
<span></span><a rel="nofollow" title="Source: Vintage 2021 Population Estimates Program.  Estimates are not comparable to other geographic levels due to methodology differences that may exist between different data sources."></a>
</div>75.8%</td>
</tr>

In [8]:
perc_race[0].findAll('a')[0]['data-title']

'White alone, percent'

In [9]:
perc_race_list = [x.findAll('a')[0]['data-title'] for x in perc_race]
perc_race_list

['White alone, percent',
 'Black or African American alone, percent',
 'American Indian and Alaska Native alone, percent',
 'Asian alone, percent',
 'Native Hawaiian and Other Pacific Islander alone, percent',
 'Two or More Races, percent',
 'Hispanic or Latino, percent',
 'White alone, not Hispanic or Latino, percent']

In [10]:
race_list = [s.replace(', percent', '') for s in perc_race_list]
race_list

['White alone',
 'Black or African American alone',
 'American Indian and Alaska Native alone',
 'Asian alone',
 'Native Hawaiian and Other Pacific Islander alone',
 'Two or More Races',
 'Hispanic or Latino',
 'White alone, not Hispanic or Latino']

In [11]:
perc_race[0].findAll('td')[1]['data-value']

'75.8'

In [12]:
perc_list = [x.findAll('td')[1]['data-value'] for x in perc_race]
perc_list

['75.8', '13.6', '1.3', '6.1', '0.3', '2.9', '18.9', '59.3']

In [13]:
pop_race_perc_2021_df = pd.DataFrame({'race': race_list, 'percent': perc_list})
pop_race_perc_2021_df

Unnamed: 0,race,percent
0,White alone,75.8
1,Black or African American alone,13.6
2,American Indian and Alaska Native alone,1.3
3,Asian alone,6.1
4,Native Hawaiian and Other Pacific Islander alone,0.3
5,Two or More Races,2.9
6,Hispanic or Latino,18.9
7,"White alone, not Hispanic or Latino",59.3


pop_race_perc_2021_df.to_csv('../data/pop_race_perc_2021.csv', index = False)

In [14]:
URL = 'https://www.hourly.io/post/minimum-wage-by-state#:~:text=Which%20States%20Have%20a%20%2415,we%20get%20to%20mid%2D2023.'

response = requests.get(URL)

In [15]:
response.status_code

200

In [16]:
soup1 = BS(response.text)
print(soup1.prettify())

<!DOCTYPE html>
<!-- Last Published: Wed May 10 2023 01:12:57 GMT+0000 (Coordinated Universal Time) -->
<html data-wf-domain="www.hourly.io" data-wf-page="6188f211a7d6f5faf0d2036b" data-wf-site="5e4d02fd9692e694f86e0e98" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Minimum Wage by State for 2023 and Beyond - Hourly, Inc.
  </title>
  <meta content="Twenty-two states and Washington, D.C., raised their minimum wage on Jan. 1, 2023. Connecticut, Oregon, Nevada, and Florida will follow suit later in the year." name="description"/>
  <meta content="Minimum Wage by State for 2023 and Beyond - Hourly, Inc." property="og:title"/>
  <meta content="Twenty-two states and Washington, D.C., raised their minimum wage on Jan. 1, 2023. Connecticut, Oregon, Nevada, and Florida will follow suit later in the year." property="og:description"/>
  <meta content="https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce9653edfd10a35dbcf_tn-Minimum%20Wages%20by%20state.jpg" propert

In [17]:
images = soup1.findAll('img')
print(type(images))
images

<class 'bs4.element.ResultSet'>


[<img alt="Official logo of Hourly, Inc." class="nav-band_logo" src="https://assets-global.website-files.com/5e4d02fd9692e694f86e0e98/5e4d071cd11c744ad7418ab5_Hourly-logo.svg"/>,
 <img alt="" class="nav-dropdown-link_icon" src="https://assets-global.website-files.com/5e4d02fd9692e694f86e0e98/61f9002cbe5693482119d6e2_Payroll_Icon.svg"/>,
 <img alt="" class="nav-dropdown-link_icon" src="https://assets-global.website-files.com/5e4d02fd9692e694f86e0e98/61f9003d722466412ce33a9e_Pricing_Icon.svg"/>,
 <img alt="" class="nav-dropdown-link_icon" src="https://assets-global.website-files.com/5e4d02fd9692e694f86e0e98/61f90051da1162d61d8dfb7b_Reviews_Icon.svg"/>,
 <img alt="" class="nav-dropdown-link_icon" src="https://assets-global.website-files.com/5e4d02fd9692e694f86e0e98/61f9014dcd05a46cf7808628_help.svg"/>,
 <img alt="" class="nav-dropdown-link_icon" src="https://assets-global.website-files.com/5e4d02fd9692e694f86e0e98/61f9015f8967ffbb32d8897f_Blog_Icon.svg"/>,
 <img alt="" class="nav-dropdown

In [18]:
images[12]

<img alt="Minimum Wage By State" class="blog-post-hero-image" sizes="(max-width: 991px) 100vw, 66vw" src="https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state.jpg" srcset="https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state-p-500.jpg 500w, https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state-p-800.jpg 800w, https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state-p-1080.jpg 1080w, https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state-p-1600.jpg 1600w, https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state.jpg 1896w"/>

In [19]:
hero_image = images[12]
hero_image['src']

'https://assets-global.website-files.com/5e6aa7798a5728055c457ebb/64090ce3f87263d2c4483f23_hero-Minimum%20Wages%20by%20state.jpg'

In [20]:
states = soup1.findAll('td')
states

[<td>Alabama</td>,
 <td>$7.25 <br/>(no state minimum)</td>,
 <td>$2.13</td>,
 <td>N/A</td>,
 <td>N/A</td>,
 <td>Alaska</td>,
 <td>$10.85</td>,
 <td>$10.85 </td>,
 <td>Jan. 1, 2024</td>,
 <td>Annual <a href="https://labor.alaska.gov/news/2022/news22-17.htm">cost of living adjustment</a></td>,
 <td><a href="https://www.hourly.io/post/minimum-wage-in-arizona">Arizona</a></td>,
 <td>$13.85 </td>,
 <td>$10.85</td>,
 <td>Jan. 1, 2024</td>,
 <td>Annual <a href="https://www.azleg.gov/ars/23/00363.htm">cost of living adjustment</a></td>,
 <td>Arkansas</td>,
 <td>$11.00 (for employers of 4 or more employees)</td>,
 <td>$2.63</td>,
 <td>N/A</td>,
 <td>N/A</td>,
 <td><a href="https://www.hourly.io/post/minimum-wage-california">California</a></td>,
 <td>$15.50</td>,
 <td>$15.50</td>,
 <td>Jan. 1, 2024</td>,
 <td>Annual <a href="https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=LAB&amp;sectionNum=1182.12.">cost of living adjustment</a> </td>,
 <td>Colorado</td>,
 <td>$13.65

In [21]:
states[0].text

'Alabama'

In [22]:
list = [x.text for x in states]
list

['Alabama',
 '$7.25 (no state minimum)',
 '$2.13',
 'N/A',
 'N/A',
 'Alaska',
 '$10.85',
 '$10.85 ',
 'Jan. 1, 2024',
 'Annual cost of living adjustment',
 'Arizona',
 '$13.85 ',
 '$10.85',
 'Jan. 1, 2024',
 'Annual cost of living adjustment',
 'Arkansas',
 '$11.00 (for employers of 4 or more employees)',
 '$2.63',
 'N/A',
 'N/A',
 'California',
 '$15.50',
 '$15.50',
 'Jan. 1, 2024',
 'Annual cost of living adjustment ',
 'Colorado',
 '$13.65',
 '$10.63',
 'Jan. 1, 2024',
 'Annual cost of living adjustment ',
 'Connecticut',
 '$14.00',
 '$6.38 (for hotel and restaurant employees)$8.23 (for bartenders)',
 'June 1, 2023',
 'Will increase to $15.00 per hour ',
 'Delaware',
 '$11.75',
 '$2.23',
 'Jan. 1, 2024',
 'Will increase to $13.25',
 'Florida',
 '$11.00',
 '$7.98 ',
 'Sep. 30, 2023',
 'Will increase to $12.00',
 'Georgia',
 '$7.25 (for FLSA-covered employees)$5.15 (non-FLSA employers)',
 '$2.13',
 'N/A',
 'N/A',
 'Hawaii',
 '$12.00',
 '$11.00',
 'Jan. 1, 2024 ',
 'Will increase to $1

In [23]:
# Split the original list into sub-lists of 5 items each
sub_lists = [list[i:i+5] for i in range(0, len(list), 5)]

# Create a new list of tuples where each tuple contains the five elements of a sub-list
min_wage = pd.DataFrame(sub_lists, columns=['State', 'Minimum Wage', 'Tipped Minimum Wage', 'Effective Date', 'Notes'])

min_wage

Unnamed: 0,State,Minimum Wage,Tipped Minimum Wage,Effective Date,Notes
0,Alabama,$7.25 (no state minimum),$2.13,,
1,Alaska,$10.85,$10.85,"Jan. 1, 2024",Annual cost of living adjustment
2,Arizona,$13.85,$10.85,"Jan. 1, 2024",Annual cost of living adjustment
3,Arkansas,$11.00 (for employers of 4 or more employees),$2.63,,
4,California,$15.50,$15.50,"Jan. 1, 2024",Annual cost of living adjustment
...,...,...,...,...,...
56,New Jersey,$15.00 per hour$15.00 per hour,"Jan. 1, 2024 (employers with six or more emplo...",Rhode Island,$15.00 per hour
57,"Jan. 1, 2025",Virginia,$15.00 per hour,"Jan. 1, 2026",Washington
58,$15.74,California,$15.50,Massachusetts,$15.00
59,New York,"$14.20$15.00 (Long Island, Westchester, and NYC)",New Jersey,$14.13 (For large employers),Connecticut


min_wage.to_csv('../data/min_wage.csv', index = False)