In [None]:
import requests
import zipfile
import pandas as pd
import io
import re
import sys
import datetime
from bs4 import BeautifulSoup
import pdfplumber
import xml.etree.ElementTree as ET

from IPython.display import display

# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
def get_asset_type(asset_code = None) -> str:
    """
    Get the asset type codes from house.gov website.
    Parameters
    ----------
    asset_code : str
        The asset code to get the asset type name for.
    Returns
    -------
    str
        The asset name.
    """
    url = "https://fd.house.gov/reference/asset-type-codes.aspx"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find_all('table')[0]
    html_string = str(table)
    html_io = io.StringIO(html_string)
    df = pd.read_html(html_io)[0]

    df = df[df['Asset Code'] == asset_code]
    asset_name = df['Asset Name'].values[0]
    return asset_name

get_asset_type('4K')

'401K and Other Non-Federal Retirement Accounts'

In [None]:
def get_asset_type_df() -> pd.DataFrame:
    """
    Get the asset type codes from house.gov website.
    Returns
    -------
    pd.DataFrame
        A DataFrame containing the asset codes and their names.
    """
    url = "https://fd.house.gov/reference/asset-type-codes.aspx"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find_all('table')[0]
    html_string = str(table)
    html_io = io.StringIO(html_string)
    df = pd.read_html(html_io)[0]

    return df

get_asset_type()

In [47]:
def get_congress_trading_data() -> pd.DataFrame:
    """
    Downloads the latest financial disclosure data from the House of Representatives
    and returns a DataFrame with the data.
    """

    file_path = 'data/congress/'
    current_year = datetime.datetime.now().year
    current_fd = str(current_year) + "FD"

    # Define the URL of the zip file
    url = "https://disclosures-clerk.house.gov/public_disc/financial-pdfs/" + current_fd + ".zip"

    # Send a GET request to download the zip file
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to download the file")
        sys.exit()

    # Load the zip file into memory
    zip_file = zipfile.ZipFile(io.BytesIO(response.content))

    # Initialize lists to store data
    txt_data = []
    xml_data = []

    # Extract the TXT file
    txt_file_name = current_fd + ".txt"
    with zip_file.open(txt_file_name) as txt_file:
        for line in txt_file:
            txt_data.append(line.decode("utf-8").strip().split("\t"))

    # Extract the XML file
    xml_file_name = current_fd + ".xml"
    with zip_file.open(xml_file_name) as xml_file:
        tree = ET.parse(xml_file)
        root = tree.getroot()

        for trade in root.findall('.//Member'):
            trade_data = {child.tag: child.text for child in trade}
            xml_data.append(trade_data)

    # Create DataFrames
    txt_df = pd.DataFrame(txt_data[1:], columns=txt_data[0])
    txt_df.reset_index(drop=True, inplace=True)

    # Remove index 



    xml_df = pd.DataFrame(xml_data)

    # Save the DataFrames to CSV files
    txt_df.to_csv(file_path + current_fd + ".csv", index=False)

    return txt_df

In [133]:
def get_congress_members() -> list:
    """
    Get the members of Congress from the House of Representatives website.
    Returns
    -------
    dict
        A dictionary containing the members of Congress.
    """
    congress_data = get_congress_trading_data()

    congress_members = congress_data['Last'].unique().tolist()
    congress_members = congress_members[1:]

    return congress_members

In [130]:
def get_doc_ids(trade_list) -> str:
    """
    Get the document IDs from the trade list.
    Parameters
    ----------
    trade_list : list
        The list of trades.
    Returns
    -------
    str
        The document IDs.
    """
    doc_ids = []
    for trade in trade_list:
        if trade['DocID'] == None:
            raise ValueError("DocID is missing, some members of Congress do not have a DocID for downloadable PDFs.")
        else:
            doc_id = trade['DocID']
        doc_ids.append(doc_id)
    return doc_ids

In [244]:
def download_and_parse_pdf(doc_id) -> pd.DataFrame:
    """
    """

    file_path = 'data/congress/'
    current_year = datetime.datetime.now().year
    pdf_file_name = doc_id + ".pdf"

    # Define the URL of the zip file
    url = "https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/" + str(current_year) + '/' + pdf_file_name

    # Send a GET request to download the zip file
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to download the file")
        sys.exit()

    # Use the pdfplumber library to extract text from the PDF


    # Create the pdf file
    with open(file_path + 'pdf/' + pdf_file_name, 'wb') as pdf_file:
        pdf_file.write(response.content)

    # Open the PDF file
    with pdfplumber.open(file_path + 'pdf/' + pdf_file_name) as pdf:
        pdf_text = "".join(page.extract_text() for page in pdf.pages)

    # Extract structured data
    pdf_data = []
    owner_types = ["SP", "DC", "JT"]
    lines = pdf_text.splitlines()

    i = 0
    while i < len(lines):
        line = lines[i]
        if any(line.startswith(owner_type) for owner_type in owner_types):
            columns = line.split()
            owner = columns[0]
            asset = " ".join(columns[1:-6])
            ticker = columns[-6].removeprefix("(").removesuffix(")")
            transaction_type = columns[-5]
            transaction_date = columns[-4]
            notification_date = columns[-3]
            amount = "".join(columns[-2:-1])

            # Check the next line for additional amount information
            if i + 1 < len(lines) and lines[i + 1].strip().startswith("[OP]"):
                additional_amount = lines[i + 1].strip().split()[-1]
                amount += " " + additional_amount
                i += 1  # Skip the next line as it has been processed

            pdf_data.append([
                owner, asset, ticker, transaction_type, transaction_date,
                notification_date, amount
            ])
        i += 1


    # Convert to DataFrame
    pdf_df = pd.DataFrame(pdf_data, columns=[
        "Owner", "Asset", "Ticker", "Transaction Type",
        "Transaction Date", "Notification Date", "Amount"
    ])


    return pdf_df

In [246]:
congress_data = get_congress_trading_data()

member = get_congress_members()
# print(member[540])

# Filter for trades by a specific member
trades = congress_data[congress_data['Last'] == member[540]]

# Create a manifest of DocIDs
manifest = get_doc_ids(trades.to_dict('records'))
# display(manifest)

download_and_parse_pdf(manifest[1])


Unnamed: 0,Owner,Asset,Ticker,Transaction Type,Transaction Date,Notification Date,Amount
0,SP,"Forge Investments, LLC",[AB],P,03/04/2024,03/04/2024,"$1,000,001"


In [195]:
congress_data = get_congress_trading_data()

member = get_congress_members()
# print(member[540])

# Filter for trades by a specific member
trades = congress_data[congress_data['Last'] == member[540]]

# Create a manifest of DocIDs
manifest = get_doc_ids(trades.to_dict('records'))
# display(manifest)

download_and_parse_pdf(manifest[0])


Unnamed: 0,Transaction ID,Owner,Asset,Transaction Type,Transaction Date,Notification Date,Amount,Capital Gains > $200?
0,SP,Palo,"Alto Networks, Inc.",(PANW),P,02/12/2024,"02/12/2024 $500,001",-
1,SP,Palo,"Alto Networks, Inc.",(PANW),P,02/21/2024,"02/21/2024 $100,001",-


In [183]:
# List of congress members to download data for
insider_traders = ['Pelosi', 'Mast', 'McCarthy', 'Schumer', 'McConnell', 'AOC', 'Boebert', 'Cruz', 'Hawley', 'Sanders', 'Warren', 'Harris', 'Biden']
insider_traders

['Pelosi',
 'Mast',
 'McCarthy',
 'Schumer',
 'McConnell',
 'AOC',
 'Boebert',
 'Cruz',
 'Hawley',
 'Sanders',
 'Warren',
 'Harris',
 'Biden']