In [18]:
import os
import xmltodict
import json

In [19]:
# Bulk BILLSTATUS archives to fetch, as [bill type, zip URL] pairs.
# The bill type doubles as the name of the local folder the archive is unpacked into.
files_to_download = [
    ['hr', 'https://www.govinfo.gov/bulkdata/BILLSTATUS/118/hr/BILLSTATUS-118-hr.zip'],
    ['s', 'https://www.govinfo.gov/bulkdata/BILLSTATUS/118/s/BILLSTATUS-118-s.zip'],
]

# Whitespace placed between the chained shell steps of one archive.
_STEP_GAP = ' ' * 5


def _shell_steps_for(bill_type, file_url):
    """Return the shell snippet that rebuilds one bill-type folder from its archive.

    The steps are chained with ``&&`` so each runs only if the previous one
    succeeded; the trailing ``|| echo`` reports a failure of any of them.
    """
    archive_name = file_url.split("/")[-1]
    steps = [
        f'rm -rf ./{bill_type}',
        f'&& mkdir -p ./{bill_type}',
        f'&& wget -P ./{bill_type} "{file_url}"',
        f'&& unzip ./{bill_type}/{archive_name} -d ./{bill_type}',
        f'&& rm ./{bill_type}/{archive_name}',
        f'|| echo "Failed to download or unzip {file_url}"',
    ]
    return _STEP_GAP.join(steps)


# One command line covering every archive; because the echo fallback succeeds,
# a failure for one bill type does not stop the next one from being attempted.
bash_command = ' && '.join(
    _shell_steps_for(bill_type, file_url)
    for bill_type, file_url in files_to_download
)

In [20]:
# Run the assembled download/unzip command line in a shell; IPython substitutes
# the value of `bash_command` for the {…} placeholder before executing it.
! {bash_command}

--2024-04-24 20:54:26--  https://www.govinfo.gov/bulkdata/BILLSTATUS/118/hr/BILLSTATUS-118-hr.zip
Resolving www.govinfo.gov (www.govinfo.gov)... 2606:4700:4400::ac40:9244, 2606:4700:4400::6812:29bc, 172.64.146.68, ...
Connecting to www.govinfo.gov (www.govinfo.gov)|2606:4700:4400::ac40:9244|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25277770 (24M) [application/zip]
Saving to: ‘./hr/BILLSTATUS-118-hr.zip’


2024-04-24 20:54:55 (875 KB/s) - ‘./hr/BILLSTATUS-118-hr.zip’ saved [25277770/25277770]

Archive:  ./hr/BILLSTATUS-118-hr.zip
  inflating: ./hr/BILLSTATUS-118hr184.xml  
  inflating: ./hr/BILLSTATUS-118hr802.xml  
  inflating: ./hr/BILLSTATUS-118hr363.xml  
  inflating: ./hr/BILLSTATUS-118hr183.xml  
  inflating: ./hr/BILLSTATUS-118hr6978.xml  
  inflating: ./hr/BILLSTATUS-118hr1503.xml  
  inflating: ./hr/BILLSTATUS-118hr215.xml  
  inflating: ./hr/BILLSTATUS-118hr182.xml  
  inflating: ./hr/BILLSTATUS-118hr208.xml  
  inflating: ./hr/BILLSTATUS-118hr1

In [21]:
def convert_folder_xml_to_newline_json(folder, batch_size=250):
    """Convert each subfolder of bill-status XML files into batched newline-delimited JSON.

    For every immediate subdirectory of ``folder`` (e.g. ``hr``, ``s``), each
    ``*.xml`` file is parsed with ``xmltodict``, reduced to the fields kept by
    ``enforce_schema``, and written out in files of at most ``batch_size``
    records named ``<subfolder>/<subfolder>_bill_status_<n>.json`` (one JSON
    document per line, no trailing newline).  Once a subfolder's batches are
    written, the XML files that were converted are deleted.

    Args:
        folder: Directory whose immediate subdirectories hold the XML files.
        batch_size: Maximum number of bills per output file. Defaults to 250.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: If an XML file cannot be read, parsed or conformed; the
            underlying error is chained as ``__cause__``.  Nothing is written
            or deleted for the subfolder being processed when this happens.
    """
    # A non-positive batch size would write nothing and then delete the XML.
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')

    for subfolder in sorted(os.listdir(folder)):
        subfolder_path = os.path.join(folder, subfolder)
        # Test the joined path: the bare name would be resolved against the
        # current working directory rather than `folder`.
        if not os.path.isdir(subfolder_path):
            continue

        json_objects = []
        xml_files = []

        # Sorted so the contents of each batch are reproducible between runs.
        for filename in sorted(os.listdir(subfolder_path)):
            if not filename.endswith(".xml"):
                continue
            xml_file = os.path.join(subfolder_path, filename)
            try:
                # Bytes, not text: the XML parser then honours the encoding
                # declared in the file instead of the platform default.
                with open(xml_file, "rb") as f:
                    json_object = xmltodict.parse(f.read())
                # only keep needed fields
                json_objects.append(enforce_schema(json_data=json_object['billStatus']))
            except Exception as exc:
                # Keep the original error attached so the failure is diagnosable.
                raise Exception(f'failed to convert data for {xml_file}') from exc
            xml_files.append(xml_file)

        for batch_count, start in enumerate(range(0, len(json_objects), batch_size)):
            batch = json_objects[start:start + batch_size]

            # Convert batch to newline-separated JSON
            batch_json_str = "\n".join(json.dumps(obj) for obj in batch)

            # The file name uses the subfolder's own name; embedding the full
            # path would point into a directory that does not exist unless
            # `folder` is './'.
            output_file = os.path.join(
                subfolder_path, f'{subfolder}_bill_status_{batch_count}.json'
            )
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(batch_json_str)

        # Remove original xml files (best effort: problems are reported, not raised)
        for xml_file in xml_files:
            try:
                os.remove(xml_file)
            except FileNotFoundError:
                print(f'{xml_file} not found.')
            except OSError as e:
                print(f"Error deleting {xml_file}: {e}")

    

In [22]:
def enforce_schema(json_data):
    """Project a parsed ``billStatus`` mapping onto the fixed set of fields that is kept.

    Args:
        json_data: The ``billStatus`` mapping produced by ``xmltodict.parse``.
            Keys may be missing, or present with the value ``None`` (which is
            how xmltodict represents an empty element); both are tolerated.

    Returns:
        dict: ``{"bill": {...}}`` always carrying the same keys in the same
        order.  Absent scalar fields are ``None`` and absent repeated
        sections are empty lists.
    """
    # `or {}` instead of a .get() default: the default only applies when the
    # key is missing, not when it is present with a None value.
    bill = (json_data or {}).get("bill") or {}
    cosponsors = bill.get("cosponsors") or {}
    policy_area = bill.get("policyArea") or {}
    subjects = bill.get("subjects") or {}
    latest_action = bill.get("latestAction") or {}

    def converted_items(container, convert):
        # A lone <item> is parsed as a dict and several as a list;
        # ensure_list evens that out before each entry is converted.
        return [convert(entry) for entry in ensure_list(get_item_if_exists(container))]

    conform_data = {
        "bill": {
            "number": bill.get("number"),
            "updateDate": bill.get("updateDate"),
            "type": bill.get("type"),
            "introducedDate": bill.get("introducedDate"),
            "congress": bill.get("congress"),
            "committees": {
                "item": converted_items(bill.get("committees"), item_dict)
            },
            "actions": {
                "item": converted_items(bill.get("actions"), action_dict)
            },
            "sponsors": {
                "item": converted_items(bill.get("sponsors"), sponsor_dict)
            },
            "cosponsors": {
                "count": cosponsors.get("count"),
                "item": converted_items(cosponsors, cosponsor_dict)
            },
            "policyArea": {
                "name": policy_area.get("name")
            },
            "subjects": {
                "legislativeSubjects": {
                    "item": converted_items(subjects.get("legislativeSubjects"), subject_dict)
                }
            },
            "title": bill.get("title"),
            "latestAction": {
                "actionDate": latest_action.get("actionDate"),
                "text": latest_action.get("text")
            }
        }
    }

    return conform_data


def item_dict(item):
    """Reduce one committee entry to its ``name``, ``chamber`` and ``type``.

    Fields missing from ``item`` come back as ``None``; anything that is not
    a dict yields an empty dict.
    """
    if not isinstance(item, dict):
        return {}
    kept_fields = ("name", "chamber", "type")
    return {field: item.get(field) for field in kept_fields}


def action_dict(action):
    """Reduce one action entry to the kept fields plus its recorded votes.

    Args:
        action: A parsed action entry; anything that is not a dict yields ``{}``.

    Returns:
        dict: ``actionDate``, ``text``, ``type`` and ``actionCode`` (``None``
        when absent) and ``recordedVotes``, a list with one ``vote_dict``
        result per recorded vote (empty when there are none).
    """
    if not isinstance(action, dict):
        return {}

    # `or {}` rather than a .get() default: a "recordedVotes" key that is
    # present with a None value (an empty element) would otherwise make the
    # chained lookup fail.
    recorded_votes = action.get("recordedVotes") or {}

    return {
        "actionDate": action.get("actionDate"),
        "text": action.get("text"),
        "type": action.get("type"),
        "actionCode": action.get("actionCode"),
        # A single vote is a dict and several are a list; ensure_list evens that out.
        "recordedVotes": [vote_dict(vote) for vote in ensure_list(recorded_votes.get("recordedVote", []))]
    }


def sponsor_dict(sponsor):
    """Reduce one sponsor entry to its identifier and name fields.

    Args:
        sponsor: A parsed sponsor entry; anything that is not a dict yields ``{}``.

    Returns:
        dict: ``bioguideID``, ``fulName``, ``firstName`` and ``lastName``
        (``None`` when absent).
    """
    if not isinstance(sponsor, dict):
        return {}
    return {
        # The output keys keep their existing spelling ("bioguideID",
        # "fulName") so the emitted records do not change shape.  On the
        # input side both spellings are accepted, because looking up only
        # "fulName" misses a source element named "fullName".
        # NOTE(review): the source XML appears to use "bioguideId" and
        # "fullName" - confirm against a sample file.
        "bioguideID": sponsor.get("bioguideId", sponsor.get("bioguideID")),
        "fulName": sponsor.get("fullName", sponsor.get("fulName")),
        "firstName": sponsor.get("firstName"),
        "lastName": sponsor.get("lastName")
    }


def cosponsor_dict(cosponsor):
    """Reduce one cosponsor entry to its identifier.

    Args:
        cosponsor: A parsed cosponsor entry; anything that is not a dict yields ``{}``.

    Returns:
        dict: A single ``bioguideID`` key (``None`` when absent).
    """
    if not isinstance(cosponsor, dict):
        return {}
    return {
        # The output key keeps its existing spelling so emitted records do not
        # change shape; both input spellings are accepted.
        # NOTE(review): the source XML appears to use "bioguideId" - confirm
        # against a sample file.
        "bioguideID": cosponsor.get("bioguideId", cosponsor.get("bioguideID"))
    }


def subject_dict(subject):
    """Reduce one legislative-subject entry to its ``name``.

    A missing name comes back as ``None``; anything that is not a dict
    yields an empty dict.
    """
    if not isinstance(subject, dict):
        return {}
    subject_name = subject.get("name")
    return {"name": subject_name}


def vote_dict(vote):
    """Reduce one recorded-vote entry to the six fields that are kept.

    Fields missing from ``vote`` come back as ``None``; anything that is not
    a dict yields an empty dict.
    """
    if not isinstance(vote, dict):
        return {}
    kept_fields = (
        "rollNumber",
        "url",
        "chamber",
        "congress",
        "date",
        "sessionNumber",
    )
    return {field: vote.get(field) for field in kept_fields}


def ensure_list(item):
    """Return ``item`` as a list.

    A list is handed back unchanged (same object), any other truthy value is
    wrapped in a one-element list, and any other falsy value becomes ``[]``.
    """
    if isinstance(item, list):
        return item
    return [item] if item else []

def get_item_if_exists(input_data):
    """Return the ``"item"`` entry of ``input_data``, or ``[]`` when there is none.

    A falsy ``input_data`` (``None``, empty mapping) also yields ``[]``.
    """
    if not input_data:
        return []
    return input_data.get("item", [])

In [23]:
# Convert every subfolder of the current directory to batched JSON files.
# Note that this also deletes the XML files it converts.
convert_folder_xml_to_newline_json('./')

In [None]:
# Print one generated batch file from the 's' folder to inspect the output.
# Index 1 is the second batch, so this file only exists when that folder
# produced more than one batch.
! cat 's/s_bill_status_1.json'

In [9]:
# Parse one raw bill-status file into nested dicts/lists to inspect its structure.
# NOTE: convert_folder_xml_to_newline_json deletes the .xml files it converts,
# so this only works while ./s/BILLSTATUS-118s1057.xml is still on disk.
with open('./s/BILLSTATUS-118s1057.xml', "r") as f:
    xml_content = f.read()
    json_object = xmltodict.parse(xml_content)

In [10]:
# Display the parsed document: a nested mapping rooted at 'billStatus'.
json_object

{'billStatus': {'version': '3.0.0',
  'bill': {'number': '1057',
   'updateDate': '2024-04-17T23:52:11Z',
   'updateDateIncludingText': '2024-04-17T23:52:11Z',
   'originChamber': 'Senate',
   'originChamberCode': 'S',
   'type': 'S',
   'introducedDate': '2023-03-29',
   'congress': '118',
   'committees': {'item': {'systemCode': 'ssas00',
     'name': 'Armed Services Committee',
     'chamber': 'Senate',
     'type': 'Standing',
     'activities': {'item': {'name': 'Referred to',
       'date': '2023-03-29T20:33:09Z'}}}},
   'relatedBills': {'item': {'title': 'Further Strengthening Supply Chains for Servicemembers and Security Act of 2023',
     'congress': '118',
     'number': '5151',
     'type': 'HR',
     'latestAction': {'actionDate': '2023-08-04',
      'text': 'Referred to the House Committee on Armed Services.'},
     'relationshipDetails': {'item': {'type': 'Identical bill',
       'identifiedBy': 'CRS'}}}},
   'actions': {'item': [{'actionDate': '2023-03-29',
      'commit