In [None]:
import os

# Read the Congress.gov API key from the environment instead of pasting it into
# the notebook, so the secret is not saved with the code or its outputs.
# Set it before starting the kernel, e.g. `export CONGRESS_GOV_API_KEY=...`.
CONGRESS_GOV_API_KEY = os.environ.get("CONGRESS_GOV_API_KEY", "")
if not CONGRESS_GOV_API_KEY:
    print("warning: CONGRESS_GOV_API_KEY is not set; requests to api.congress.gov will likely be rejected")

In [40]:
import requests
import pandas as pd

_METADATA_COLUMNS = ["congress", "number", "type", "updateDate"]


def _append_metadata_rows(csv_path, rows):
    """Append bill-metadata dicts to the CSV at ``csv_path`` (no header row); no-op when empty."""
    if rows:
        pd.DataFrame(rows, columns=_METADATA_COLUMNS).to_csv(
            csv_path, mode="a", index=False, header=False, encoding="utf-8"
        )
        print("saved to csv")


def fetch_legislations_metadata(
    csv_output="data/legislation_metadata.csv",
    limit=250,
    max_iter=150,
    from_datetime="2013-01-01T00:00:00Z",
):
    """Page through the Congress.gov bill listing and store basic metadata as CSV.

    For every bill in each page the ``congress``, ``number``, ``type`` and
    ``updateDate`` fields are kept. The CSV at ``csv_output`` is recreated with
    a header row on every call, and rows are appended in batches while paging.

    Args:
        csv_output: Path of the CSV file to (re)create.
        limit: Page size sent to the API with every request.
        max_iter: Maximum number of pages to request.
        from_datetime: Value of the ``fromDateTime`` query parameter.

    Raises:
        RuntimeError: If the API answers with a non-2xx status. The message
            deliberately omits the API key.
    """
    import os

    FLUSH_EVERY = 10  # write buffered rows to disk every this many pages

    output_dir = os.path.dirname(csv_output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Start from a fresh file that contains only the header row.
    pd.DataFrame(columns=_METADATA_COLUMNS).to_csv(csv_output, index=False, encoding="utf-8")

    # The key and page size travel in `params` and are never part of `next_url`,
    # so the URL can be logged without leaking the secret.
    query = {"api_key": CONGRESS_GOV_API_KEY, "limit": limit}
    next_url = f"https://api.congress.gov/v3/bill?fromDateTime={from_datetime}"
    pending_rows = []
    rows_read = 0

    try:
        for page_index in range(max_iter):
            response = requests.get(next_url, params=query, timeout=60)
            if not response.ok:
                # Not raise_for_status(): its message embeds the full URL, key included.
                raise RuntimeError(
                    f"Congress.gov API returned HTTP {response.status_code} for {next_url}"
                )
            payload = response.json()

            bills = payload["bills"]
            pending_rows.extend({col: bill[col] for col in _METADATA_COLUMNS} for bill in bills)
            rows_read += len(bills)

            # Treat a missing or empty "next" link as the last page.
            pagination = payload.get("pagination", {})
            next_url = pagination.get("next")
            if not next_url:
                break

            last_update = pending_rows[-1]["updateDate"] if pending_rows else "n/a"
            print(
                f"iter: {page_index}, updateDate: {last_update}, "
                f"length: {rows_read}/{pagination.get('count', '?')}, last_url: {next_url}"
            )

            if page_index % FLUSH_EVERY == 0:
                _append_metadata_rows(csv_output, pending_rows)
                pending_rows = []
    finally:
        # Persist whatever is still buffered, even if a request failed mid-run.
        _append_metadata_rows(csv_output, pending_rows)

# Run the metadata crawl: issues up to 150 paged API requests and overwrites
# data/legislation_metadata.csv.
fetch_legislations_metadata()

iter: 0, updateDate: 2023-11-15, length: 250/402558, last_url: https://api.congress.gov/v3/bill?fromDateTime=2013-01-01T00:00:00Z&offset=250&limit=250&format=json&api_key=REDACTED&limit=250
saved to csv
iter: 1, updateDate: 2023-11-16, length: 500/402558, last_url: https://api.congress.gov/v3/bill?fromDateTime=2013-01-01T00:00:00Z&offset=500&limit=250&format=json&api_key=REDACTED&limit=250
iter: 2, updateDate: 2023-11-14, length: 750/402558, last_url: https://api.congress.gov/v3/bill?fromDateTime=2013-01-01T00:00:00Z&offset=750&limit=250&format=json&api_key=REDACTED&limit=250
iter: 3, updateDate: 2023-11-14, length: 1000/402558, last_url: https://api.congress.gov/v3/bill?fromDateTime=2013-01-01T00:00:00Z&offset=1000&limit=250&format=json&api_key=REDACTED&limit=250
iter: 4, updateDate: 2023-11-14, length: 1250/402558, last_url: https://api.congress.gov/v3/bill?

In [166]:
import pandas as pd
import multiprocessing as mp
from multiprocessing.pool import ThreadPool as Pool
import tqdm

def legislation_is_valid(legislation_text):
    """Return True unless ``legislation_text`` begins with the literal ``<!DOCTYPE html>``.

    NOTE(review): presumably a full HTML document means the site served an
    error or landing page instead of the bill text — confirm.
    """
    html_doctype = "<!DOCTYPE html>"
    return not legislation_text.startswith(html_doctype)

def remove_html_tags(legislation_text):
    """Drop the first 18 and the last 21 characters of ``legislation_text``.

    NOTE(review): the offsets presumably match a fixed HTML wrapper around the
    bill text (e.g. ``<html><body><pre>`` ... ``</pre></body></html>`` plus
    newlines) — confirm against a real response. No tag parsing is performed.
    """
    leading_chars = 18
    trailing_chars = 21
    body_span = slice(leading_chars, -trailing_chars)
    return legislation_text[body_span]

def download_legislation(bill_metadata):
    """Download the text of one bill and save it under ``data/legislations/``.

    The "ih" and "is" text versions are tried in that order. The first response
    that is successful and passes ``legislation_is_valid`` is written to
    ``data/legislations/{congress}-{type}-{number}-{version}.txt`` after being
    passed through ``remove_html_tags``.

    Args:
        bill_metadata: Sequence unpacked as ``(congress, number, type)``.

    Returns:
        The path of the file that was written, or ``None`` when no version could
        be downloaded (an "invalid <url>" line is printed in that case).
    """
    import os

    bill_congress, bill_number, bill_type = bill_metadata
    # Only the first two suffixes ("ih", "is") are tried below.
    AMENDMENT_TYPES = ("ih", "is", "rh", "es", "rds", "enr", "eh", "ath")
    bill_type = str(bill_type).lower()

    output_dir = "data/legislations"
    # Safe to call from several worker threads thanks to exist_ok=True.
    os.makedirs(output_dir, exist_ok=True)

    for amendment in AMENDMENT_TYPES[:2]:  # only check ih and is
        legislation_url = f"https://www.congress.gov/{bill_congress}/bills/{bill_type}{bill_number}/BILLS-{bill_congress}{bill_type}{bill_number}{amendment}.htm"
        try:
            response = requests.get(legislation_url, timeout=60)
        except requests.RequestException as exc:
            # One failed request must not abort the whole batch; try the next version.
            print("request failed", legislation_url, exc)
            continue
        if not response.ok:
            continue
        legislation_text = response.text
        if legislation_is_valid(legislation_text):
            output_path = f"{output_dir}/{bill_congress}-{bill_type}-{bill_number}-{amendment}.txt"
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(remove_html_tags(legislation_text))
            return output_path
    print("invalid", legislation_url)
    return None

def download_legislations(csv_path=None, limit=100):
    """Download bill texts for the first ``limit`` rows of the metadata CSV.

    Rows are handed to ``download_legislation`` on a thread pool while a tqdm
    bar tracks completion.

    Args:
        csv_path: Metadata CSV with ``congress``, ``number`` and ``type``
            columns. When omitted, a module-level ``csv_output`` is used if one
            exists; otherwise ``data/legislation_metadata.csv``.
        limit: Number of leading rows to download; ``None`` means all rows.
    """
    if csv_path is None:
        # `csv_output` is only a local inside fetch_legislations_metadata, so a
        # fresh kernel has no such global; fall back to the same default path.
        csv_path = globals().get("csv_output", "data/legislation_metadata.csv")
    df = pd.read_csv(csv_path)

    df_subsection_values = df[0:limit][["congress", "number", "type"]].values

    # cpu_count() * 3 // 4 is 0 on a single-core machine, which Pool rejects.
    worker_count = max(1, mp.cpu_count() * 3 // 4)

    # The context manager releases the worker threads once all results are consumed.
    with Pool(worker_count) as pool:
        # download legislation and print progress using mp
        for _ in tqdm.tqdm(
            pool.imap_unordered(download_legislation, df_subsection_values),
            total=len(df_subsection_values),
        ):
            pass

# Download bill texts for the first 100 metadata rows into data/legislations/.
download_legislations()

100%|██████████| 100/100 [00:00<00:00, 665762.54it/s]
