In [49]:
import pandas as pd
import numpy as np
import requests
import time
import json
import os
import collections
collections.Callable = collections.abc.Callable
from bs4 import BeautifulSoup

In [2]:
# The ProPublica key is read from the environment so it never appears in the notebook.
propublica_token = os.environ['propublica_token']

# httpbin echoes back the User-Agent string that requests sends by default;
# keep it so it can be declared explicitly in later request headers.
useragent_url = 'https://httpbin.org/user-agent'
r = requests.get(url=useragent_url)
useragent = json.loads(r.text)['user-agent']
useragent

'python-requests/2.28.1'

In [3]:
# Headers sent with the ProPublica API requests: the API key, the user agent
# reported by httpbin, and a contact address. The address is the same
# 'virginia.edu' one used by the congress.gov requests further down.
headers = {'X-API-Key': propublica_token,
           'User-Agent': useragent,
           'From': 'czj9zj@virginia.edu'}

## Goal: Get the text of all bills sponsored by Bob Good in the 117th Congress
### Step 1: Get Bob Good's ID number from the ProPublica members API

In [4]:
# Build the members endpoint for the chosen Congress and chamber.
root = "https://api.propublica.org"
congress = "117"
chamber = "house"
endpoint = f"/congress/v1/{congress}/{chamber}/members.json"

# Fetch the member list, sending the API key and identifying headers.
r = requests.get(root + endpoint, headers=headers)

# Flatten the nested JSON: one row per record found under results -> members.
myjson = json.loads(r.text)
membersdf = pd.json_normalize(myjson, record_path=["results", "members"])

# Show the first three members, transposed so every field is visible.
membersdf.head(3).T

Unnamed: 0,0,1,2
id,A000370,A000055,A000371
title,Representative,Representative,Representative
short_title,Rep.,Rep.,Rep.
api_uri,https://api.propublica.org/congress/v1/members...,https://api.propublica.org/congress/v1/members...,https://api.propublica.org/congress/v1/members...
first_name,Alma,Robert,Pete
middle_name,,B.,
last_name,Adams,Aderholt,Aguilar
suffix,,,
date_of_birth,1946-05-27,1965-07-22,1979-06-19
gender,F,M,M


In [5]:
# Keep only the member rows whose last name is exactly 'Good'.
bobgood = membersdf[membersdf['last_name'] == 'Good']

In [6]:
# Renumber the filtered rows from 0, then read the 'id' of the first one.
bobgoodid = bobgood.reset_index().loc[0, 'id']

### Step 2: Use Bob Good's ID to query the bills API

In [7]:
# Endpoint listing the bills this member has introduced.
endpoint = f'/congress/v1/members/{bobgoodid}/bills/introduced.json'

# First request, with no offset.
r = requests.get(root + endpoint, headers=headers)
myjson = json.loads(r.text)

# One row per bill found under results -> bills.
bgbills1 = pd.json_normalize(myjson, record_path=['results', 'bills'])

In [10]:
# Same endpoint again, skipping the first 20 records
# (presumably the API returns 20 bills per request; confirm against the API docs).
r = requests.get(root + endpoint, headers=headers, params=dict(offset=20))
myjson = json.loads(r.text)
bgbills2 = pd.json_normalize(myjson, record_path=['results', 'bills'])

In [11]:
# Stack both result pages into one frame with a fresh 0..n-1 index.
bgbills = pd.concat((bgbills1, bgbills2), ignore_index=True)

# Preview the first three bills, transposed so every field is visible.
bgbills.head(3).transpose()

Unnamed: 0,0,1,2
congress,117,117,117
bill_id,hr8935-117,hr8767-117,hres1297-117
bill_type,hr,hr,hres
number,H.R.8935,H.R.8767,H.RES.1297
bill_uri,https://api.propublica.org/congress/v1/117/bil...,https://api.propublica.org/congress/v1/117/bil...,https://api.propublica.org/congress/v1/117/bil...
title,To amend the Labor-Management Reporting and Di...,To establish a private right of action for par...,"Designating the week beginning November 7, 202..."
short_title,To amend the Labor-Management Reporting and Di...,Empowering Parents Act,"Designating the week beginning November 7, 202..."
sponsor_title,Rep.,Rep.,Rep.
sponsor_id,G000595,G000595,G000595
sponsor_name,Robert Good,Robert Good,Robert Good


In [44]:
# congress.gov page of the bill stored at row label 11 of the combined table.
urltoscrape = bgbills.loc[11, 'congressdotgov_url']
urltoscrape

'https://www.congress.gov/bill/117th-congress/house-bill/5731'

In [23]:
# Same bill page, with the suffix that requests its text tab in txt format.
urltoscrape = bgbills.loc[11, 'congressdotgov_url'] + '/text?format=txt'
urltoscrape

'https://www.congress.gov/bill/117th-congress/house-bill/5731/text?format=txt'

In [24]:
# Download the bill-text page. The HTTP header name is 'User-Agent' (hyphen);
# a 'User_Agent' key would be sent as an unrelated custom header instead.
r = requests.get(urltoscrape, headers={'User-Agent': useragent, 'From': 'czj9zj@virginia.edu'})

# Parse the returned HTML so individual elements can be searched.
myhtml = BeautifulSoup(r.text, 'html.parser')

In [33]:
# CSS class list of the first <h3> carrying the "currentVersion" class.
myhtml.find_all('h3', class_="currentVersion")[0]['class']

['currentVersion']

In [34]:
# First <span> nested inside that heading.
myhtml.find_all('h3', class_="currentVersion")[0].span

<span>Introduced in House (10/26/2021)</span>

In [35]:
# All text inside that heading, with the tags stripped.
myhtml.find_all('h3', class_="currentVersion")[0].get_text()

'Shown Here:Introduced in House (10/26/2021)'

In [41]:
# The bill text is the content of the first <pre> element on the page.
print(myhtml.find_all('pre')[0].get_text())

[Congressional Bills 117th Congress]
[From the U.S. Government Publishing Office]
[H.R. 5731 Introduced in House (IH)]








117th CONGRESS
  1st Session
                                H. R. 5731

 To provide that no Federal funds may be expended to implement certain 
         law enforcement partnerships, and for other purposes.


_______________________________________________________________________


                    IN THE HOUSE OF REPRESENTATIVES

                            October 26, 2021

Mr. Good of Virginia (for himself, Mr. Gosar, Mrs. Boebert, Mr. Duncan, 
 Mr. Perry, Mrs. Miller of Illinois, Mr. Cawthorn, Mr. Buck, Mr. Weber 
   of Texas, and Mr. Cloud) introduced the following bill; which was 
               referred to the Committee on the Judiciary

_______________________________________________________________________

                                 A BILL


 
 To provide that no Federal funds may be expended to implement certain 
         law enforcement pa

In [56]:
# Print the text-page URL for House bills 1 through 9.
for i in range(1, 10):
    print(f'https://www.congress.gov/bill/117th-congress/house-bill/{i}/text?format=txt')

https://www.congress.gov/bill/117th-congress/house-bill/1/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/2/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/3/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/4/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/6/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/7/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt


In [63]:
# A spider is just this function applied to a list of URLs: it takes one
# bill-text URL and returns the text of that bill.

def scrape_one_bill(url):
    """Download one bill-text page and return the text of the bill.

    Parameters
    ----------
    url : str
        Address of the page to fetch, e.g. a congress.gov bill URL ending
        in '/text?format=txt'.

    Returns
    -------
    str or None
        The text of the first <pre> element on the page, or None when the
        page contains no <pre> element.
    """
    time.sleep(2)  # pause before every request; the delay was taken from the site's robots.txt
    print('Now getting the text from ' + url)
    # The HTTP header name is 'User-Agent' (hyphen); a 'User_Agent' key would
    # be sent as an unrelated custom header.
    r = requests.get(url, headers={'User-Agent': useragent, 'From': 'czj9zj@virginia.edu'})
    myhtml = BeautifulSoup(r.text, 'html.parser')
    # Explicit check instead of a bare `except: pass`, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    first_pre = myhtml.find('pre')
    if first_pre is None:
        return None
    return first_pre.text

In [64]:
# Text-page URLs for House bills 3 through 12.
urllist = [
    f'https://www.congress.gov/bill/117th-congress/house-bill/{billno}/text?format=txt'
    for billno in range(3, 13)
]
urllist

['https://www.congress.gov/bill/117th-congress/house-bill/3/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/4/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/5/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/6/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/7/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/10/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/11/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/12/text?format=txt']

In [68]:
# Scrape every URL in order; each entry is whatever scrape_one_bill returned for it.
bills = list(map(scrape_one_bill, urllist))

# Show the result for the second URL in the list.
print(bills[1])

Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/3/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/4/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/5/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/6/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/7/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/10/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/11/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/12/text?format=t