In [None]:
import os
import requests
import re
import json
import pandas as pd

# Endpoint of the Federal Register API that is queried for documents
base_url = "https://www.federalregister.gov/api/v1/documents.json"

# Document attributes requested through the "fields[]" query parameter
_requested_fields = [
    "title",
    "toc_subject",
    "disposition_notes",  # notes
    "document_number",
    "executive_order_number",
    "pdf_url",
    "raw_text_url",  # URL used later to download the full body text
    "presidential_document_number",
    "president",
    "publication_date",
    "signing_date",
    "citation",  # stored downstream as the EO citation
]

# Query parameters sent with every search request; "page" is advanced by the
# pagination loop further down.
params = {
    "fields[]": _requested_fields,
    "per_page": 200,  # max 1000
    "order": "newest",
    "conditions[agencies][]": "executive-office-of-the-president",
    "conditions[type][]": "PRESDOCU",
    "conditions[presidential_document_type][]": "executive_order",
    # NOTE(review): empty value - presumably "no president filter"; confirm
    # that the API ignores an empty condition.
    "conditions[president][]": "",
    "page": 1,  # first page to request
}

# Directory to save downloaded executive orders
download_dir = 'executive_orders'

# Create the directory if it does not exist. exist_ok=True makes this one
# idempotent call, so there is no gap between an existence check and the
# creation in which another process could create the directory first and
# cause a FileExistsError.
os.makedirs(download_dir, exist_ok=True)

# Characters stripped from file names: \ / * ? : " < > |
# Compiled once so repeated calls do not look the pattern up again.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|]')


# Function to sanitize filenames by removing invalid characters and truncating
def sanitize_filename(filename, max_length=100):
    """Return *filename* without invalid characters, cut to *max_length*.

    Args:
        filename: Proposed file name (a string).
        max_length: Maximum number of characters kept after the invalid
            characters have been removed. Defaults to 100, the limit this
            function has always applied.

    Returns:
        The cleaned, truncated name. It may be an empty string if
        *filename* is empty or consists only of invalid characters.

    Raises:
        ValueError: If *max_length* is negative.
    """
    if max_length < 0:
        # A negative slice bound would silently drop characters from the end
        # instead of limiting the length.
        raise ValueError('max_length must be non-negative')
    return _INVALID_FILENAME_CHARS.sub('', filename)[:max_length]

# Function to fetch full text from the raw_text_url
def fetch_full_text(text_url, timeout=30):
    """Download the text served at *text_url*.

    Args:
        text_url: URL to fetch (the caller passes a document's raw_text_url).
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block the caller indefinitely.

    Returns:
        The response body as a string when the server answers with HTTP 200;
        None when the request fails or any other status code is returned.
    """
    try:
        response = requests.get(text_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status so that one unreachable document does not abort the caller.
        print(f'Failed to fetch full text from {text_url}: {exc}')
        return None

    if response.status_code != 200:
        print(f'Failed to fetch full text from {text_url}')
        return None
    return response.text

# List to hold all the data (one dictionary per document)
documents_data = []

# Pagination loop for API requests: request page after page until a page
# comes back without results or a request fails.
while True:
    try:
        # The timeout keeps a stalled connection from blocking the loop
        # forever; raise_for_status turns HTTP error responses into
        # exceptions instead of trying to parse an error page as JSON.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Stop paging but keep everything collected so far, so the export
        # below still writes the documents that were fetched successfully.
        print(f'Stopping at page {params["page"]}: {exc}')
        break

    # Break if there are no more results
    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        break

    # Loop through each document and organize data
    for document in results:
        raw_text_url = document.get('raw_text_url')

        # Fetch the full body text from the raw text URL; fetch_full_text
        # returns None when the download fails.
        full_text = fetch_full_text(raw_text_url) if raw_text_url else "No full text available"

        # NOTE(review): "presidential_document_number" is requested in the
        # query but has never been part of the exported record; the record
        # keys are deliberately left unchanged here.
        documents_data.append({
            "title": document.get('title'),
            "president": document.get('president'),
            "publication_date": document.get('publication_date'),
            "signing_date": document.get('signing_date'),
            "citation": document.get('citation'),
            "document_number": document.get('document_number'),
            "executive_order_number": document.get('executive_order_number'),
            "pdf_url": document.get('pdf_url'),
            "toc_subject": document.get('toc_subject'),
            "disposition_notes": document.get('disposition_notes'),
            "full_text": full_text,
        })

    # Move to the next page
    params["page"] += 1

# Build a pandas DataFrame from the collected document dictionaries
df = pd.DataFrame(documents_data)

# Write the same records as indented, non-ASCII-escaped JSON (UTF-8) into
# the download directory
json_file = os.path.join(download_dir, 'executive_orders_rawtext_final.json')
with open(json_file, 'w', encoding='utf-8') as jsonf:
    jsonf.write(json.dumps(documents_data, ensure_ascii=False, indent=4))
print(f'Data exported to JSON: {json_file}')

In [None]:
!pip install pdfplumber