In [1]:
import re
from bs4 import BeautifulSoup
import requests
from typing import Dict, Set

In [2]:
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, "html5lib")

all_urls = [a['href']
                for a in soup('a')
                if a.has_attr('href')]

print(len(all_urls))

967


In [3]:
regex = r"^https?://.*\.house\.gov/?$"

assert re.match(regex, 'http://joel.house.gov')
assert re.match(regex, 'https://joel.house.gov')
assert re.match(regex, "http://joel.house.gov/")
assert re.match(regex, "https://joel.house.gov/")
assert not re.match(regex, 'joel.house.gov')
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.gov/biography")

good_urls = [url for url in all_urls if re.match(regex, url)]

print(len(good_urls))

870


In [4]:
good_urls = list(set(good_urls))

print(len(good_urls))

435


In [5]:
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html5lib')

links = {a['href'] for a in soup('a') if 'press releases'
                                                            in a.text.lower()}
print(links)

{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


In [20]:
press_releases: Dict[str, Set[str]] = {}
    
for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases'
                                                                   in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://khanna.house.gov: {'/media/press-releases'}
https://castro.house.gov: {'https://castro.house.gov/media-center/press-releases'}
https://malliotakis.house.gov: {'/media/press-releases'}
https://wenstrup.house.gov: {'/news/documentquery.aspx?DocumentTypeID=2491'}
https://guest.house.gov: {'/media/press-releases'}
https://higgins.house.gov: {'/press-releases/archived-press-releases-2008', '/media-center/press-releases'}
https://sewell.house.gov/: {'/frontpage?qt-home_page_tabs=0#qt-home_page_tabs', '/media-center/press-releases'}
https://nehls.house.gov: {'/media/press-releases'}
https://eshoo.house.gov/: {'/media/press-releases'}
https://sherrill.house.gov/: {'/media/press-releases'}
https://gonzalez.house.gov: {'/media/press-releases'}
https://cartwright.house.gov: {'/news/documentquery.aspx?DocumentTypeID=2442'}
https://beatty.house.gov: {'/media-center/press-releases'}
https://watsoncoleman.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://kevinmccarthy.house.g

https://degette.house.gov: {'/newsroom/press-releases'}
https://turner.house.gov/: {'/frontpage?qt-home_page_tabs=0#qt-home_page_tabs', '/media-center/press-releases'}
https://huizenga.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2041'}
https://ross.house.gov: {'/media/press-releases'}
https://cole.house.gov: {'/media-center/press-releases'}
https://adams.house.gov: {'/media-center/press-releases'}
https://loudermilk.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://costa.house.gov/: {'/media-center/press-releases'}
https://sarajacobs.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://perlmutter.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://ebjohnson.house.gov: {'/media-center/press-releases'}
https://cloud.house.gov: set()
https://jones.house.gov: {'/media/press-releases'}
https://mikerogers.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://maloney.house.gov/: {'/news/press-releases'}
https://fleischmann.hou

https://norton.house.gov/: {'/media-center/press-releases'}
https://dankildee.house.gov: {'/media/press-releases'}
https://fletcher.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://panetta.house.gov: {'/media/press-releases'}
https://huffman.house.gov: set()
https://fernandez.house.gov: {'/media/press-releases'}
https://speier.house.gov/: {'/press-releases'}
https://mcclain.house.gov: {'/media/press-releases'}
https://pocan.house.gov: {'/media-center/press-releases'}
https://sessions.house.gov: {'/media/press-releases'}
https://mchenry.house.gov: {'/news/documentquery.aspx?DocumentTypeID=418'}
https://susielee.house.gov: {'/media/press-releases'}
https://salazar.house.gov: {'/media/press-releases'}
https://escobar.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://mikethompson.house.gov/: {'/newsroom/press-releases'}
https://morgangriffith.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2235'}
https://mann.house.gov: {'/media/press-releases'}
https

https://marymiller.house.gov: {'/media/subscribe-press-releases', '/media/press-releases'}
https://budd.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://phillips.house.gov/: {'/media/press-releases'}
https://golden.house.gov: {'/media/press-releases'}
https://meijer.house.gov: {'/media/press-releases'}
https://vanduyne.house.gov: {'/media/press-releases'}
https://pence.house.gov/: {'/media/press-releases'}
https://zeldin.house.gov/: {'/media-center/press-releases'}
https://garretgraves.house.gov/: {'/media-center/press-releases'}
https://aguilar.house.gov/: {'/media-center/press-releases'}
https://pallone.house.gov: {'/media/press-releases'}
https://timmons.house.gov/: {'/media/press-releases'}
https://davids.house.gov/: {'/media/press-releases'}
https://newhouse.house.gov: {'/media-center/press-releases'}
https://webster.house.gov/: {'/press-releases'}
https://rice.house.gov: {'/press-releases'}
https://massie.house.gov: set()
https://westerman.house.gov/: set()
https

In [22]:
def paragraph_mentions(text: str, keyword: str) ->bool:
    '''Return True if tags <p> got the word'''
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [p.get_text() for p in soup('p')]
    
    return any(keyword.lower() in paragraph.lower()
                              for paragraph in paragraphs)

In [26]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
assert paragraph_mentions(text, 'twitter')
assert not paragraph_mentions(text, 'facebook')

In [30]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text
        
        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break            

https://mullin.house.gov
https://davidscott.house.gov/
https://kelly.house.gov
https://pascrell.house.gov/
https://boebert.house.gov
https://banks.house.gov
https://ohalleran.house.gov
https://reed.house.gov/
https://panetta.house.gov
https://mcclain.house.gov
https://gomez.house.gov/
https://allred.house.gov/
https://jhb.house.gov/
https://sires.house.gov
https://gwenmoore.house.gov
https://case.house.gov/
https://gibbs.house.gov/
https://anthonygonzalez.house.gov
