# Medicare Outpatient Hospitals - by Provider and Service

In [1]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json

def fetch_medicare_outpatient_by_provider_apc(out_csv='medicare_outpatient_by_provider_apc.csv', timeout=30, limit=None):
    """Fetch Medicare Outpatient Hospitals - by Provider and APC via CMS Data API v1.
    Returns a DataFrame on success or None on failure."""
    api_url = 'https://data.cms.gov/data-api/v1/dataset/ccbc9a44-40d4-46b4-a709-5caa59212e50/data'
    
    # Optional: Add query params for filtering (e.g., by provider or APC; see API docs)
    params = {}
    if limit:
        params['$limit'] = limit  # SODA-style param if supported
    
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/JZembower/HospiTrack)',
        'Accept': 'application/json'
    }
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Check if response is JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        # Assume top-level is a list of dicts (as in sample); adjust if wrapped (e.g., data['data'])
        if isinstance(data, dict) and 'data' in data:
            records = data['data']
        else:
            records = data
        
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty. API may have no matching data.")
            return None
        
        print(f"Parsed {len(df)} rows with columns: {list(df.columns)}")
        # Preview first few rows
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: ccbc9a44-40d4-46b4-a709-5caa59212e50')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run this to fetch and save the data
df = fetch_medicare_outpatient_by_provider_apc()

Fetching API: https://data.cms.gov/data-api/v1/dataset/ccbc9a44-40d4-46b4-a709-5caa59212e50/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_CCN', 'Rndrng_Prvdr_Org_Name', 'Rndrng_Prvdr_St', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_RUCA', 'Rndrng_Prvdr_RUCA_Desc', 'APC_Cd', 'APC_Desc', 'Bene_Cnt', 'CAPC_Srvcs', 'Avg_Tot_Sbmtd_Chrgs', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Outlier_Srvcs', 'Avg_Mdcr_Outlier_Amt']

First 3 rows preview:
Rndrng_Prvdr_CCN           Rndrng_Prvdr_Org_Name        Rndrng_Prvdr_St Rndrng_Prvdr_City Rndrng_Prvdr_State_Abrvtn Rndrng_Prvdr_State_FIPS Rndrng_Prvdr_Zip5 Rndrng_Prvdr_RUCA                                                                               Rndrng_Prvdr_RUCA_Desc APC_Cd                                                APC_Desc Bene_Cnt CAPC_Srvcs Avg_Tot_Sbmtd_Chrgs Avg_Mdcr_Alowd_Amt Avg_Mdcr_Pymt_Amt Outlier_Srvcs Avg_Mdcr_Outlier_Amt
          010001 Southeast Healt

In [2]:
import pandas as pd
import numpy as np

def clean_medicare_outpatient_by_provider_apc(df, out_csv='medicare_outpatient_by_provider_apc_cleaned.csv'):
    """Clean the Medicare Outpatient Hospitals - by Provider and APC dataset."""
    # Step 1: Rename columns
    column_mapping = {
        'Rndrng_Prvdr_CCN': 'provider_ccn',
        'Rndrng_Prvdr_Org_Name': 'provider_org_name',
        'Rndrng_Prvdr_St': 'provider_street_address',
        'Rndrng_Prvdr_City': 'provider_city',
        'Rndrng_Prvdr_State_Abrvtn': 'provider_state_abbreviation',
        'Rndrng_Prvdr_State_FIPS': 'provider_state_fips',
        'Rndrng_Prvdr_Zip5': 'provider_zip5',
        'Rndrng_Prvdr_RUCA': 'provider_ruca_code',
        'Rndrng_Prvdr_RUCA_Desc': 'provider_ruca_description',
        'APC_Cd': 'apc_code',
        'APC_Desc': 'apc_description',
        'Bene_Cnt': 'beneficiary_count',
        'CAPC_Srvcs': 'apc_services',
        'Avg_Tot_Sbmtd_Chrgs': 'average_total_submitted_charges',
        'Avg_Mdcr_Alowd_Amt': 'average_medicare_allowed_amount',
        'Avg_Mdcr_Pymt_Amt': 'average_medicare_payment_amount',
        'Outlier_Srvcs': 'outlier_services',
        'Avg_Mdcr_Outlier_Amt': 'average_medicare_outlier_amount'
    }
    df = df.rename(columns=column_mapping)
    
    # Step 2: Data type conversion
    # String columns
    string_cols = ['provider_ccn', 'provider_state_fips', 'provider_zip5', 
                   'provider_state_abbreviation', 'apc_code']
    for col in string_cols:
        df[col] = df[col].astype(str).str.strip()
    
    # Numeric columns (handle empty strings as NaN)
    numeric_cols = ['beneficiary_count', 'apc_services', 'outlier_services']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')  # Nullable integer
    
    financial_cols = ['average_total_submitted_charges', 'average_medicare_allowed_amount',
                     'average_medicare_payment_amount', 'average_medicare_outlier_amount']
    for col in financial_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').round(2)
    
    # Step 3: Clean string columns
    string_clean_cols = ['provider_org_name', 'provider_street_address', 'provider_city',
                        'provider_ruca_description', 'apc_description']
    for col in string_clean_cols:
        df[col] = df[col].str.strip().str.replace(r'\\\/', '/', regex=True)  # Fix escaped slashes
    
    # Step 4: Validation
    # Check for duplicates
    duplicates = df.duplicated(subset=['provider_ccn', 'apc_code']).sum()
    if duplicates > 0:
        print(f"Warning: Found {duplicates} duplicate rows based on provider_ccn and apc_code.")
    
    # Check for invalid values (e.g., negative counts or payments)
    for col in numeric_cols:
        if (df[col] < 0).any():
            print(f"Warning: Negative values found in {col}.")
    for col in financial_cols:
        if (df[col] < 0).any():
            print(f"Warning: Negative values found in {col}.")
    
    # Step 5: Save cleaned dataset
    df.to_csv(out_csv, index=False)
    print(f"\nCleaned dataset saved to {out_csv}")
    print(f"Rows: {len(df)}, Columns: {list(df.columns)}")
    print("\nFirst 3 rows preview:")
    print(df.head(3).to_string(index=False))
    
    return df

# Usage: Assuming df is loaded from fetch_medicare_outpatient_by_provider_apc
df = fetch_medicare_outpatient_by_provider_apc()  # Run fetch first
df_cleaned = clean_medicare_outpatient_by_provider_apc(df)

Fetching API: https://data.cms.gov/data-api/v1/dataset/ccbc9a44-40d4-46b4-a709-5caa59212e50/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_CCN', 'Rndrng_Prvdr_Org_Name', 'Rndrng_Prvdr_St', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_RUCA', 'Rndrng_Prvdr_RUCA_Desc', 'APC_Cd', 'APC_Desc', 'Bene_Cnt', 'CAPC_Srvcs', 'Avg_Tot_Sbmtd_Chrgs', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Outlier_Srvcs', 'Avg_Mdcr_Outlier_Amt']

First 3 rows preview:
Rndrng_Prvdr_CCN           Rndrng_Prvdr_Org_Name        Rndrng_Prvdr_St Rndrng_Prvdr_City Rndrng_Prvdr_State_Abrvtn Rndrng_Prvdr_State_FIPS Rndrng_Prvdr_Zip5 Rndrng_Prvdr_RUCA                                                                               Rndrng_Prvdr_RUCA_Desc APC_Cd                                                APC_Desc Bene_Cnt CAPC_Srvcs Avg_Tot_Sbmtd_Chrgs Avg_Mdcr_Alowd_Amt Avg_Mdcr_Pymt_Amt Outlier_Srvcs Avg_Mdcr_Outlier_Amt
          010001 Southeast Healt