In [1]:
import json
import requests
import pandas as pd


def print_json_tree(data, indent=0):
    if isinstance(data, dict):
        for key, value in data.items():
            print('  ' * indent + str(key))
            print_json_tree(value, indent + 1)
    elif isinstance(data, list):
        for i, item in enumerate(data):
            print('  ' * indent + str(i))
            print_json_tree(item, indent + 1)
    else:
        print('  ' * indent + str(data))

In [None]:
def get_metadata_schemas_list():
    url = "https://data.healthcare.gov/api/1/metastore/schemas"
    res = requests.get(url)

    data = res.json()
    return list(data.keys())

In [None]:
url = "https://data.healthcare.gov/api/1/metastore/schemas/data-dictionary"
res = requests.get(url)

print_json_tree(res.json())
print(res.json())

In [None]:
url = "https://data.healthcare.gov/api/1/datastore/query/download?format=json"
res = requests.get(url)

print_json_tree(res.json())
print(res.json())

In [None]:
url = "https://data.healthcare.gov/api/1/search/facets"
res = requests.get(url)

print_json_tree(res.json())
print(res.json())

In [None]:
url = "https://data.healthcare.gov/api/1/search/facets"
res = requests.get(url)

print_json_tree(res.json())
print(res.json())

In [None]:
base_url = "https://data.healthcare.gov/api/1/"
page_size = 10 #Maximum page size supported by API

search_url = base_url + f"search"
response = requests.get(search_url)

num_results = int(response.json()['total'])
iterations = num_results // page_size + (num_results % page_size > 0)

def dataset_metadata_handling(response_json, key, item):
    """
    Helper function for __init__ method
    """
    try:
        return response_json[key][item]
    except KeyError:
        return None


datasets = []
searchable_keywords = []
for i in range(iterations):
    search_url = base_url + f"search?page={i+1}&page_size={page_size}"
    response = requests.get(search_url)

    response_json = response.json()['results']
    
    for result in response_json.keys():
        searchable_keywords.extend(response_json[result]['keyword'])
        datasets.append({
            "dataset": result,
            "title": dataset_metadata_handling(response_json, result, 'title'),
            "description": dataset_metadata_handling(response_json, result, 'description'),
            "issue date": dataset_metadata_handling(response_json, result, 'issued'),
            "modified date": dataset_metadata_handling(response_json, result, 'modified')
        })
    
    searchable_keywords = list(set(searchable_keywords))

print(searchable_keywords)
print(datasets)

#This takes about 30 seconds to run. Is that too long or acceptable? Can try to make it faster later.

['QHP Landscape Instructions', 'ffm', 'qualifying health plan', 'QHP', 'Network', 'assisters', 'ethnicity', 'consumer type', 'Rate', 'PY2023', 'healthcare', 'metal level', 'csr', 'household income', 'Transparency in Coverage', 'Individual Market Dental', 'Exchange PUF', 'ECP', 'PY2024', 'rtl', 'federal poverty level', 'PY2025', 'aptc', 'fpl', 'PY2022', 'utility', 'race', 'county', 'SHOP', 'age', 'Individual Market Medical', 'Individual', 'SADP', 'QHP Landscape', 'SHOP Market Dental', 'mlr', 'agents', 'qhp', 'localhelp', 'brokers', 'SHOP Market Medical', 'Plan ID Crosswalk', 'cost sharing reduction', 'Benefits and Cost Sharing', 'Service Area', 'Plan Attributes', 'Marketplace PUF', 'Machine Readable', 'loa', 'advanced premium tax credit', 'Business Rules', '2025', 'rcl']
[{'dataset': 'dkan_dataset/a2tn-wyd5', 'title': 'QHP PY2018 Dent- Indi- Land-10-18-2017', 'description': '2018 Plan Year Individual Dental Landscape', 'issue date': '2017-10-24T14:27:31.000Z', 'modified date': '2019-02-

In [17]:
pd.DataFrame(datasets)

Unnamed: 0,dataset,title,description,issue date,modified date
0,dkan_dataset/a2tn-wyd5,QHP PY2018 Dent- Indi- Land-10-18-2017,2018 Plan Year Individual Dental Landscape,2017-10-24T14:27:31.000Z,2019-02-18T23:24:09.000Z
1,dkan_dataset/e8uy-7rnp,AB Suspension and Termination List,"Access Data Dictionary Here - <a href=""https:/...",2016-05-03T01:01:37+00:00,2025-05-07T13:01:00+00:00
2,dkan_dataset/wb6u-x2ny,AB Registration Completion List,"Access Data Dictionary Here - <a href=""https:/...",2016-03-30T14:54:00+00:00,2025-05-07T12:58:00+00:00
3,dkan_dataset/d89b-9897,2015 Qualifying Health Plan Selections by Meta...,All figures are based on plan selections with ...,2015-06-22T15:13:14.000Z,2015-07-02T14:28:57.000Z
4,dkan_dataset/dvpf-jb7v,2015 Qualifying Health Plan Selections by Race...,All figures are based on plan selections with ...,2015-06-22T15:25:14.000Z,2015-07-02T14:28:01.000Z
...,...,...,...,...,...
310,dkan_dataset/kfuw-rdvm,RY2018 MLR Dataset 12032019,This file contains Medical Loss Ratio data for...,2019-12-13T16:47:17.000Z,2019-12-20T16:06:22.000Z
311,dkan_dataset/mt79-mi5w,RY2011 MLR Dataset 20121206,This file contains Medical Loss Ratio data for...,2013-06-21T00:08:42.000Z,2013-06-21T19:53:39.000Z
312,dkan_dataset/pqqm-izx7,RY2012 MLR Dataset 20130805,This file contains Medical Loss Ratio data for...,2013-08-14T23:07:33.000Z,2019-10-07T23:04:11.000Z
313,dkan_dataset/pszf-56pp,MLR RY2014 Socrata Dataset 10302015,This file contains Medical Loss Ratio data for...,2015-11-17T20:31:20.000Z,2015-11-19T01:43:37.000Z


In [12]:
search_url = base_url + f"search?page={i+1}&page_size={page_size}"
response = requests.get(search_url)

response_json = response.json()

with open('response_1746667031867.json', 'w') as f:
    json.dump(response_json, f)