# Medicare Inpatient By Provider Service Scraping:

In [None]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json  # For pretty-printing if needed

def fetch_medicare_inpatient_by_provider_service(out_csv='medicare_inpatient_by_provider_service.csv', timeout=30, limit=None):
    """Fetch Medicare Inpatient Hospitals - by Provider and Service via CMS Data API v1.
    Returns a DataFrame on success or None on failure."""
    api_url = 'https://data.cms.gov/data-api/v1/dataset/690ddc6c-2767-4618-b277-420ffb2bf27c/data'
    
    # Optional: Add query params for filtering (e.g., by provider CCN or DRG; see API docs)
    params = {}
    if limit:
        params['$limit'] = limit  # SODA-style param if supported
    
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/imamazahoor/HospiTrack)',
        'Accept': 'application/json'  # Ensure JSON response
    }
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Check if response is JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        # Assume top-level is a list of dicts (as in sample); adjust if wrapped (e.g., data['data'])
        if isinstance(data, dict) and 'data' in data:
            records = data['data']
        else:
            records = data
        
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty. API may have no matching data.")
            return None
        
        print(f"Parsed {len(df)} rows with columns: {list(df.columns)}")
        # Preview first few rows
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: 690ddc6c-2767-4618-b277-420ffb2bf27c')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run this to fetch and save the data
df = fetch_medicare_inpatient_by_provider_service()

Fetching API: https://data.cms.gov/data-api/v1/dataset/690ddc6c-2767-4618-b277-420ffb2bf27c/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_CCN', 'Rndrng_Prvdr_Org_Name', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_St', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_RUCA', 'Rndrng_Prvdr_RUCA_Desc', 'DRG_Cd', 'DRG_Desc', 'Tot_Dschrgs', 'Avg_Submtd_Cvrd_Chrg', 'Avg_Tot_Pymt_Amt', 'Avg_Mdcr_Pymt_Amt']

First 3 rows preview:
Rndrng_Prvdr_CCN           Rndrng_Prvdr_Org_Name Rndrng_Prvdr_City        Rndrng_Prvdr_St Rndrng_Prvdr_State_FIPS Rndrng_Prvdr_Zip5 Rndrng_Prvdr_State_Abrvtn Rndrng_Prvdr_RUCA                                                                               Rndrng_Prvdr_RUCA_Desc DRG_Cd                                                                                 DRG_Desc Tot_Dschrgs Avg_Submtd_Cvrd_Chrg Avg_Tot_Pymt_Amt Avg_Mdcr_Pymt_Amt
          010001 Southeast Health Medical Center            Dothan 1108 Ross Clark Circle       

This data only parsed 1000 of the 14,000 rows. I will have to look more into it.

In [2]:
import pandas as pd
import re

def clean_medicare_inpatient_data(input_csv='C:\\Users\\jrzem\\OneDrive\\Semester Classwork\\Graduate\\Fall 2025 CMU\\Data Focused Python\\HospiTrack\\Data and Cleaning\\CMS Data\\medicare_inpatient_by_provider_service.csv', 
                                 output_csv='C:\\Users\\jrzem\\OneDrive\\Semester Classwork\\Graduate\\Fall 2025 CMU\\Data Focused Python\\HospiTrack\\Data and Cleaning\\CMS Data\\medicare_inpatient_cleaned.csv'):
    """Clean the Medicare Inpatient Hospitals by Provider and Service dataset."""
    try:
        # Read the CSV
        df = pd.read_csv(input_csv, low_memory=False)
        print(f"Loaded {len(df)} rows from {input_csv}")
        
        # Step 1: Rename columns to cleaner, standardized names
        column_mapping = {
            'Rndrng_Prvdr_CCN': 'provider_ccn',
            'Rndrng_Prvdr_Org_Name': 'provider_org_name',
            'Rndrng_Prvdr_City': 'provider_city',
            'Rndrng_Prvdr_St': 'provider_street_address',
            'Rndrng_Prvdr_State_FIPS': 'provider_state_fips',
            'Rndrng_Prvdr_Zip5': 'provider_zip5',
            'Rndrng_Prvdr_State_Abrvtn': 'provider_state_abbreviation',
            'Rndrng_Prvdr_RUCA': 'provider_ruca_code',
            'Rndrng_Prvdr_RUCA_Desc': 'provider_ruca_description',
            'DRG_Cd': 'drg_code',
            'DRG_Desc': 'drg_description',
            'Tot_Dschrgs': 'total_discharges',
            'Avg_Submtd_Cvrd_Chrg': 'average_submitted_covered_charge',
            'Avg_Tot_Pymt_Amt': 'average_total_payment_amount',
            'Avg_Mdcr_Pymt_Amt': 'average_medicare_payment_amount'
        }
        df = df.rename(columns=column_mapping)
        print("Columns renamed to:", list(df.columns))
        
        # Step 2: Convert data types
        # String columns (preserve leading zeros)
        string_cols = ['provider_ccn', 'provider_state_fips', 'provider_zip5', 
                      'provider_state_abbreviation', 'drg_code']
        for col in string_cols:
            df[col] = df[col].astype(str).str.strip()
        
        # Integer columns
        df['provider_ruca_code'] = df['provider_ruca_code'].astype(int)
        df['total_discharges'] = df['total_discharges'].astype(int)
        
        # Float columns (round to 2 decimal places for currency-like values)
        float_cols = ['average_submitted_covered_charge', 
                     'average_total_payment_amount', 
                     'average_medicare_payment_amount']
        for col in float_cols:
            df[col] = df[col].astype(float).round(2)
        
        # String cleaning for text columns
        text_cols = ['provider_org_name', 'provider_city', 'provider_street_address', 
                    'provider_ruca_description', 'drg_description']
        for col in text_cols:
            df[col] = df[col].str.strip()
            # Fix escaped characters (e.g., "\u003e" to ">")
            df[col] = df[col].apply(lambda x: re.sub(r'\\u003e', '>', x) if isinstance(x, str) else x)
            df[col] = df[col].apply(lambda x: re.sub(r'\\u003c', '<', x) if isinstance(x, str) else x)
        
        # Step 3: Validate data
        # Check for missing values
        missing = df.isnull().sum()
        if missing.any():
            print("\nMissing values per column:")
            print(missing[missing > 0])
        else:
            print("\nNo missing values detected.")
        
        # Check for duplicates (by provider_ccn and drg_code, as these should be unique)
        duplicates = df.duplicated(subset=['provider_ccn', 'drg_code']).sum()
        if duplicates > 0:
            print(f"\nWarning: Found {duplicates} duplicate rows (based on provider_ccn and drg_code).")
            # Optionally drop duplicates
            # df = df.drop_duplicates(subset=['provider_ccn', 'drg_code'], keep='first')
        else:
            print("\nNo duplicate rows detected.")
        
        # Step 4: Preview cleaned data
        print(f"\nCleaned dataset preview (first 3 rows):")
        print(df.head(3).to_string(index=False))
        
        # Step 5: Save cleaned dataset
        df.to_csv(output_csv, index=False)
        print(f"\nSaved cleaned dataset with {len(df)} rows to {output_csv}")
        return df
    
    except FileNotFoundError:
        print(f"Error: Input file {input_csv} not found.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: Input file {input_csv} is empty.")
        return None
    except Exception as e:
        print(f"Unexpected error during cleaning: {type(e).__name__} {e}")
        return None

# Usage: Run the cleaning function
df = clean_medicare_inpatient_data()

Loaded 1000 rows from C:\Users\jrzem\OneDrive\Semester Classwork\Graduate\Fall 2025 CMU\Data Focused Python\HospiTrack\Data and Cleaning\CMS Data\medicare_inpatient_by_provider_service.csv
Columns renamed to: ['provider_ccn', 'provider_org_name', 'provider_city', 'provider_street_address', 'provider_state_fips', 'provider_zip5', 'provider_state_abbreviation', 'provider_ruca_code', 'provider_ruca_description', 'drg_code', 'drg_description', 'total_discharges', 'average_submitted_covered_charge', 'average_total_payment_amount', 'average_medicare_payment_amount']

No missing values detected.

No duplicate rows detected.

Cleaned dataset preview (first 3 rows):
provider_ccn               provider_org_name provider_city provider_street_address provider_state_fips provider_zip5 provider_state_abbreviation  provider_ruca_code                                                                            provider_ruca_description drg_code                                                            

# Medicare Inpatient Hospitals - by Geography and Service

In [3]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json

def fetch_medicare_inpatient_drg_national_state(out_csv='medicare_inpatient_drg_national_state.csv', timeout=30, limit=None):
    """Fetch Medicare Inpatient Hospitals - DRG National and State Statistics via CMS Data API v1.
    Returns a DataFrame on success or None on failure."""
    api_url = 'https://data.cms.gov/data-api/v1/dataset/2941ab09-8cee-49d8-9703-f3c5b854e388/data'
    
    # Optional: Add query params for filtering (e.g., by state or DRG; see API docs)
    params = {}
    if limit:
        params['$limit'] = limit  # SODA-style param if supported
    
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/imamazahoor/HospiTrack)',
        'Accept': 'application/json'
    }
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Check if response is JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        # Assume top-level is a list of dicts (as in sample); adjust if wrapped (e.g., data['data'])
        if isinstance(data, dict) and 'data' in data:
            records = data['data']
        else:
            records = data
        
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty. API may have no matching data.")
            return None
        
        print(f"Parsed {len(df)} rows with columns: {list(df.columns)}")
        # Preview first few rows
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: 2941ab09-8cee-49d8-9703-f3c5b854e388')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run this to fetch and save the data
df = fetch_medicare_inpatient_drg_national_state()

Fetching API: https://data.cms.gov/data-api/v1/dataset/2941ab09-8cee-49d8-9703-f3c5b854e388/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_Geo_Lvl', 'Rndrng_Prvdr_Geo_Cd', 'Rndrng_Prvdr_Geo_Desc', 'DRG_Cd', 'DRG_Desc', 'Tot_Dschrgs', 'Avg_Submtd_Cvrd_Chrg', 'Avg_Tot_Pymt_Amt', 'Avg_Mdcr_Pymt_Amt']

First 3 rows preview:
Rndrng_Prvdr_Geo_Lvl Rndrng_Prvdr_Geo_Cd Rndrng_Prvdr_Geo_Desc DRG_Cd                                                                                 DRG_Desc Tot_Dschrgs Avg_Submtd_Cvrd_Chrg Avg_Tot_Pymt_Amt Avg_Mdcr_Pymt_Amt
            National                                  National    001                              HEART TRANSPLANT OR IMPLANT OF HEART ASSIST SYSTEM WITH MCC        2028         1531532.3373      332916.4857      281516.34517
            National                                  National    002                           HEART TRANSPLANT OR IMPLANT OF HEART ASSIST SYSTEM WITHOUT MCC          49         605138.22449     142682.34694      112367

In [6]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json
import re

def fetch_and_clean_medicare_inpatient_drg(out_csv='medicare_inpatient_drg_cleaned.csv', timeout=30, limit=None):
    """Fetch and clean Medicare Inpatient Hospitals - DRG National and State Statistics via CMS Data API v1.
    Returns a cleaned DataFrame and saves to CSV."""
    # API fetch (from previous function)
    api_url = 'https://data.cms.gov/data-api/v1/dataset/2941ab09-8cee-49d8-9703-f3c5b854e388/data'
    params = {}
    if limit:
        params['$limit'] = limit
    
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/imamazahoor/HospiTrack)',
        'Accept': 'application/json'
    }
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Parse JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        records = data if isinstance(data, list) else data.get('data', [])
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty.")
            return None
        
        print(f"Initial parse: {len(df)} rows with columns: {list(df.columns)}")
        
        # Step 1: Rename columns
        column_mapping = {
            'Rndrng_Prvdr_Geo_Lvl': 'geo_level',
            'Rndrng_Prvdr_Geo_Cd': 'geo_code',
            'Rndrng_Prvdr_Geo_Desc': 'geo_description',
            'DRG_Cd': 'drg_code',
            'DRG_Desc': 'drg_description',
            'Tot_Dschrgs': 'total_discharges',
            'Avg_Submtd_Cvrd_Chrg': 'average_submitted_covered_charge',
            'Avg_Tot_Pymt_Amt': 'average_total_payment_amount',
            'Avg_Mdcr_Pymt_Amt': 'average_medicare_payment_amount'
        }
        df = df.rename(columns=column_mapping)
        
        # Step 2: Standardize data types
        df['geo_level'] = df['geo_level'].astype(str).str.strip()
        df['geo_code'] = df['geo_code'].astype(str).str.strip()
        df['geo_description'] = df['geo_description'].astype(str).str.strip()
        df['drg_code'] = df['drg_code'].astype(str).str.zfill(3)  # Pad DRG codes to 3 digits
        df['drg_description'] = df['drg_description'].astype(str).str.strip()
        df['total_discharges'] = pd.to_numeric(df['total_discharges'], errors='coerce').astype('Int64')  # Nullable integer
        df['average_submitted_covered_charge'] = pd.to_numeric(df['average_submitted_covered_charge'], errors='coerce').round(2)
        df['average_total_payment_amount'] = pd.to_numeric(df['average_total_payment_amount'], errors='coerce').round(2)
        df['average_medicare_payment_amount'] = pd.to_numeric(df['average_medicare_payment_amount'], errors='coerce').round(2)
        
        # Step 3: Clean special characters in drg_description
        df['drg_description'] = df['drg_description'].apply(lambda x: re.sub(r'\u003e', '>', x))  # Replace > (escaped)
        df['drg_description'] = df['drg_description'].apply(lambda x: re.sub(r'\u003c', '<', x))  # Replace < (escaped)
        
        # Step 4: Check for missing values
        missing_counts = df.isna().sum()
        print("\nMissing values per column:")
        print(missing_counts)
        if missing_counts.sum() > 0:
            print("Warning: Missing values detected. Consider filling or dropping if needed.")
            # Example: df = df.fillna({'geo_code': ''}) for empty geo_code in national records
        
        # Step 5: Check for duplicates
        duplicates = df.duplicated(subset=['geo_level', 'geo_code', 'drg_code']).sum()
        print(f"\nDuplicate rows (based on geo_level, geo_code, drg_code): {duplicates}")
        if duplicates > 0:
            df = df.drop_duplicates(subset=['geo_level', 'geo_code', 'drg_code'], keep='first')
            print(f"Dropped {duplicates} duplicate rows.")
        
        # Step 6: Validate data
        invalid_discharges = df[df['total_discharges'] < 0].shape[0]
        invalid_charges = df[df['average_submitted_covered_charge'] < 0].shape[0]
        invalid_payments = df[df['average_medicare_payment_amount'] < 0].shape[0]
        print(f"\nData validation:")
        print(f"Rows with negative discharges: {invalid_discharges}")
        print(f"Rows with negative charges: {invalid_charges}")
        print(f"Rows with negative payments: {invalid_payments}")
        if invalid_discharges > 0 or invalid_charges > 0 or invalid_payments > 0:
            print("Warning: Invalid data detected. Consider filtering or correcting.")
        
        # Step 7: Preview cleaned data
        print(f"\nCleaned DataFrame: {len(df)} rows with columns: {list(df.columns)}")
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        # Step 8: Save to CSV
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: 2941ab09-8cee-49d8-9703-f3c5b854e388')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run to fetch, clean, and save
df = fetch_and_clean_medicare_inpatient_drg()

Fetching API: https://data.cms.gov/data-api/v1/dataset/2941ab09-8cee-49d8-9703-f3c5b854e388/data
Initial parse: 1000 rows with columns: ['Rndrng_Prvdr_Geo_Lvl', 'Rndrng_Prvdr_Geo_Cd', 'Rndrng_Prvdr_Geo_Desc', 'DRG_Cd', 'DRG_Desc', 'Tot_Dschrgs', 'Avg_Submtd_Cvrd_Chrg', 'Avg_Tot_Pymt_Amt', 'Avg_Mdcr_Pymt_Amt']

Missing values per column:
geo_level                           0
geo_code                            0
geo_description                     0
drg_code                            0
drg_description                     0
total_discharges                    0
average_submitted_covered_charge    0
average_total_payment_amount        0
average_medicare_payment_amount     0
dtype: int64

Duplicate rows (based on geo_level, geo_code, drg_code): 0

Data validation:
Rows with negative discharges: 0
Rows with negative charges: 0
Rows with negative payments: 0

Cleaned DataFrame: 1000 rows with columns: ['geo_level', 'geo_code', 'geo_description', 'drg_code', 'drg_description', 'total_dischar

# Medicare Outpatient Hospitals - by Geography and Service

In [4]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json

def fetch_medicare_outpatient_apc_hcpcs_statistics(out_csv='medicare_outpatient_apc_hcpcs_statistics.csv', timeout=30, limit=None):
    """Fetch Medicare Outpatient Hospitals - APC and HCPCS Statistics via CMS Data API v1.
    Returns a DataFrame on success or None on failure."""
    api_url = 'https://data.cms.gov/data-api/v1/dataset/04baec39-4a54-400e-824d-8e75251ceda9/data'
    
    # Optional: Add query params for filtering (e.g., by state or APC; see API docs)
    params = {}
    if limit:
        params['$limit'] = limit  # SODA-style param if supported
    
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/imamazahoor/HospiTrack)',
        'Accept': 'application/json'
    }
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Check if response is JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        # Assume top-level is a list of dicts (as in sample); adjust if wrapped (e.g., data['data'])
        if isinstance(data, dict) and 'data' in data:
            records = data['data']
        else:
            records = data
        
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty. API may have no matching data.")
            return None
        
        print(f"Parsed {len(df)} rows with columns: {list(df.columns)}")
        # Preview first few rows
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: 04baec39-4a54-400e-824d-8e75251ceda9')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run this to fetch and save the data
df = fetch_medicare_outpatient_apc_hcpcs_statistics()

Fetching API: https://data.cms.gov/data-api/v1/dataset/04baec39-4a54-400e-824d-8e75251ceda9/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_Geo_Lvl', 'Rndrng_Prvdr_Geo_Cd', 'Rndrng_Prvdr_Geo_Desc', 'Srvc_Lvl', 'APC_Cd', 'APC_Desc', 'HCPCS_Cd', 'HCPCS_Desc', 'Bene_Cnt', 'CAPC_Srvcs', 'Avg_Tot_Sbmtd_Chrgs', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Outlier_Srvcs', 'Avg_Mdcr_Outlier_Amt']

First 3 rows preview:
Rndrng_Prvdr_Geo_Lvl Rndrng_Prvdr_Geo_Cd Rndrng_Prvdr_Geo_Desc Srvc_Lvl APC_Cd                                                APC_Desc HCPCS_Cd HCPCS_Desc Bene_Cnt CAPC_Srvcs Avg_Tot_Sbmtd_Chrgs Avg_Mdcr_Alowd_Amt Avg_Mdcr_Pymt_Amt Outlier_Srvcs Avg_Mdcr_Outlier_Amt
            National                                  National      APC   5072         Level 2 Excision/ Biopsy/ Incision and Drainage                       316770     334117     10679.628242442    1476.3803965078    1170.006942987          1246      5783.2769337079
            National                             

In [7]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json
import re

def fetch_and_clean_medicare_outpatient_apc_hcpcs(out_csv='medicare_outpatient_apc_hcpcs_statistics_cleaned.csv', timeout=30, limit=None):
    """Fetch and clean Medicare Outpatient Hospitals - APC and HCPCS Statistics via CMS Data API v1.
    Returns a cleaned DataFrame and saves to CSV."""
    api_url = 'https://data.cms.gov/data-api/v1/dataset/04baec39-4a54-400e-824d-8e75251ceda9/data'
    
    # Column mapping for renaming
    column_mapping = {
        'Rndrng_Prvdr_Geo_Lvl': 'geo_level',
        'Rndrng_Prvdr_Geo_Cd': 'geo_code',
        'Rndrng_Prvdr_Geo_Desc': 'geo_description',
        'Srvc_Lvl': 'service_level',
        'APC_Cd': 'apc_code',
        'APC_Desc': 'apc_description',
        'HCPCS_Cd': 'hcpcs_code',
        'HCPCS_Desc': 'hcpcs_description',
        'Bene_Cnt': 'beneficiary_count',
        'CAPC_Srvcs': 'apc_services',
        'Avg_Tot_Sbmtd_Chrgs': 'average_total_submitted_charges',
        'Avg_Mdcr_Alowd_Amt': 'average_medicare_allowed_amount',
        'Avg_Mdcr_Pymt_Amt': 'average_medicare_payment_amount',
        'Outlier_Srvcs': 'outlier_services',
        'Avg_Mdcr_Outlier_Amt': 'average_medicare_outlier_amount'
    }
    
    # Fetch data
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/imamazahoor/HospiTrack)',
        'Accept': 'application/json'
    }
    
    params = {}
    if limit:
        params['$limit'] = limit
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Parse JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        # Handle JSON structure
        if isinstance(data, dict) and 'data' in data:
            records = data['data']
        else:
            records = data
        
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty. API may have no matching data.")
            return None
        
        # Step 1: Rename columns
        df = df.rename(columns=column_mapping)
        
        # Step 2: Convert data types
        # String columns
        string_columns = ['geo_level', 'geo_code', 'geo_description', 'service_level', 'apc_code', 'apc_description', 'hcpcs_code', 'hcpcs_description']
        for col in string_columns:
            if col in df.columns:
                df[col] = df[col].astype(str).str.strip().replace('', None)
        
        # Numeric columns (counts)
        count_columns = ['beneficiary_count', 'apc_services', 'outlier_services']
        for col in count_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        
        # Numeric columns (financial)
        financial_columns = [
            'average_total_submitted_charges',
            'average_medicare_allowed_amount',
            'average_medicare_payment_amount',
            'average_medicare_outlier_amount'
        ]
        for col in financial_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').round(2)
        
        # Step 3: Clean strings (e.g., unescape characters)
        if 'apc_description' in df.columns:
            df['apc_description'] = df['apc_description'].str.replace(r'\\u003e', '>', regex=True).str.replace(r'\\u002f', '/', regex=True)
        if 'hcpcs_description' in df.columns:
            df['hcpcs_description'] = df['hcpcs_description'].str.replace(r'\\u003e', '>', regex=True).str.replace(r'\\u002f', '/', regex=True)
        
        # Step 4: Handle missing/empty values
        df = df.replace('', None)  # Convert empty strings to None
        missing_summary = df.isnull().sum()
        if missing_summary.any():
            print("\nMissing values per column:")
            print(missing_summary[missing_summary > 0])
        
        # Step 5: Validation
        # Check for duplicates
        duplicate_check = df.duplicated(subset=['geo_level', 'geo_code', 'apc_code', 'hcpcs_code']).sum()
        if duplicate_check > 0:
            print(f"Warning: Found {duplicate_check} duplicate rows based on geo_level, geo_code, apc_code, and hcpcs_code.")
        
        # Validate non-negative numeric values
        for col in count_columns + financial_columns:
            if col in df.columns:
                invalid = df[df[col] < 0].shape[0]
                if invalid > 0:
                    print(f"Warning: {invalid} rows with negative values in {col}.")
        
        # Step 6: Output results
        print(f"\nParsed and cleaned {len(df)} rows with columns: {list(df.columns)}")
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: 04baec39-4a54-400e-824d-8e75251ceda9')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run this to fetch, clean, and save the data
df = fetch_and_clean_medicare_outpatient_apc_hcpcs()

Fetching API: https://data.cms.gov/data-api/v1/dataset/04baec39-4a54-400e-824d-8e75251ceda9/data

Missing values per column:
geo_code                             70
hcpcs_code                         1000
hcpcs_description                  1000
average_total_submitted_charges      70
average_medicare_allowed_amount      70
average_medicare_payment_amount      70
average_medicare_outlier_amount     447
dtype: int64

Parsed and cleaned 1000 rows with columns: ['geo_level', 'geo_code', 'geo_description', 'service_level', 'apc_code', 'apc_description', 'hcpcs_code', 'hcpcs_description', 'beneficiary_count', 'apc_services', 'average_total_submitted_charges', 'average_medicare_allowed_amount', 'average_medicare_payment_amount', 'outlier_services', 'average_medicare_outlier_amount']

First 3 rows preview:
geo_level geo_code geo_description service_level apc_code                                         apc_description hcpcs_code hcpcs_description  beneficiary_count  apc_services  average_total

# Medicare Outpatient Hospitals - by Provider and Service

In [5]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
import json

def fetch_medicare_outpatient_by_provider_apc(out_csv='medicare_outpatient_by_provider_apc.csv', timeout=30, limit=None):
    """Fetch Medicare Outpatient Hospitals - by Provider and APC via CMS Data API v1.
    Returns a DataFrame on success or None on failure."""
    api_url = 'https://data.cms.gov/data-api/v1/dataset/ccbc9a44-40d4-46b4-a709-5caa59212e50/data'
    
    # Optional: Add query params for filtering (e.g., by provider or APC; see API docs)
    params = {}
    if limit:
        params['$limit'] = limit  # SODA-style param if supported
    
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {
        'User-Agent': 'HospiTrack/1.0 (+https://github.com/imamazahoor/HospiTrack)',
        'Accept': 'application/json'
    }
    
    try:
        print(f'Fetching API: {api_url}')
        if params:
            print(f'With params: {params}')
        r = session.get(api_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        
        # Check if response is JSON
        try:
            data = r.json()
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON. Check if it's HTML (e.g., error page).")
            print(f"Response preview: {r.text[:200]}")
            return None
        
        if not data:
            print("Warning: No data returned from API.")
            return None
        
        # Assume top-level is a list of dicts (as in sample); adjust if wrapped (e.g., data['data'])
        if isinstance(data, dict) and 'data' in data:
            records = data['data']
        else:
            records = data
        
        df = pd.DataFrame(records)
        if df.empty:
            print("Warning: Parsed DataFrame is empty. API may have no matching data.")
            return None
        
        print(f"Parsed {len(df)} rows with columns: {list(df.columns)}")
        # Preview first few rows
        print("\nFirst 3 rows preview:")
        print(df.head(3).to_string(index=False))
        
        df.to_csv(out_csv, index=False)
        print(f'\nSaved {len(df)} rows to {out_csv}')
        return df
        
    except requests.exceptions.HTTPError as e:
        if r.status_code in [404, 410]:
            print(f'HTTP {r.status_code}: Dataset or endpoint not found. Verify ID: ccbc9a44-40d4-46b4-a709-5caa59212e50')
        print('Request failed:', type(e).__name__, e)
        return None
    except requests.exceptions.RequestException as e:
        print('Connection failed:', type(e).__name__, e)
        return None
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__} {e}")
        return None

# Usage: Run this to fetch and save the data
df = fetch_medicare_outpatient_by_provider_apc()

Fetching API: https://data.cms.gov/data-api/v1/dataset/ccbc9a44-40d4-46b4-a709-5caa59212e50/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_CCN', 'Rndrng_Prvdr_Org_Name', 'Rndrng_Prvdr_St', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_RUCA', 'Rndrng_Prvdr_RUCA_Desc', 'APC_Cd', 'APC_Desc', 'Bene_Cnt', 'CAPC_Srvcs', 'Avg_Tot_Sbmtd_Chrgs', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Outlier_Srvcs', 'Avg_Mdcr_Outlier_Amt']

First 3 rows preview:
Rndrng_Prvdr_CCN           Rndrng_Prvdr_Org_Name        Rndrng_Prvdr_St Rndrng_Prvdr_City Rndrng_Prvdr_State_Abrvtn Rndrng_Prvdr_State_FIPS Rndrng_Prvdr_Zip5 Rndrng_Prvdr_RUCA                                                                               Rndrng_Prvdr_RUCA_Desc APC_Cd                                                APC_Desc Bene_Cnt CAPC_Srvcs Avg_Tot_Sbmtd_Chrgs Avg_Mdcr_Alowd_Amt Avg_Mdcr_Pymt_Amt Outlier_Srvcs Avg_Mdcr_Outlier_Amt
          010001 Southeast Healt

In [10]:
import pandas as pd
import numpy as np

def clean_medicare_outpatient_by_provider_apc(df, out_csv='medicare_outpatient_by_provider_apc_cleaned.csv'):
    """Clean the Medicare Outpatient Hospitals - by Provider and APC dataset."""
    # Step 1: Rename columns
    column_mapping = {
        'Rndrng_Prvdr_CCN': 'provider_ccn',
        'Rndrng_Prvdr_Org_Name': 'provider_org_name',
        'Rndrng_Prvdr_St': 'provider_street_address',
        'Rndrng_Prvdr_City': 'provider_city',
        'Rndrng_Prvdr_State_Abrvtn': 'provider_state_abbreviation',
        'Rndrng_Prvdr_State_FIPS': 'provider_state_fips',
        'Rndrng_Prvdr_Zip5': 'provider_zip5',
        'Rndrng_Prvdr_RUCA': 'provider_ruca_code',
        'Rndrng_Prvdr_RUCA_Desc': 'provider_ruca_description',
        'APC_Cd': 'apc_code',
        'APC_Desc': 'apc_description',
        'Bene_Cnt': 'beneficiary_count',
        'CAPC_Srvcs': 'apc_services',
        'Avg_Tot_Sbmtd_Chrgs': 'average_total_submitted_charges',
        'Avg_Mdcr_Alowd_Amt': 'average_medicare_allowed_amount',
        'Avg_Mdcr_Pymt_Amt': 'average_medicare_payment_amount',
        'Outlier_Srvcs': 'outlier_services',
        'Avg_Mdcr_Outlier_Amt': 'average_medicare_outlier_amount'
    }
    df = df.rename(columns=column_mapping)
    
    # Step 2: Data type conversion
    # String columns
    string_cols = ['provider_ccn', 'provider_state_fips', 'provider_zip5', 
                   'provider_state_abbreviation', 'apc_code']
    for col in string_cols:
        df[col] = df[col].astype(str).str.strip()
    
    # Numeric columns (handle empty strings as NaN)
    numeric_cols = ['beneficiary_count', 'apc_services', 'outlier_services']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')  # Nullable integer
    
    financial_cols = ['average_total_submitted_charges', 'average_medicare_allowed_amount',
                     'average_medicare_payment_amount', 'average_medicare_outlier_amount']
    for col in financial_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').round(2)
    
    # Step 3: Clean string columns
    string_clean_cols = ['provider_org_name', 'provider_street_address', 'provider_city',
                        'provider_ruca_description', 'apc_description']
    for col in string_clean_cols:
        df[col] = df[col].str.strip().str.replace(r'\\\/', '/', regex=True)  # Fix escaped slashes
    
    # Step 4: Validation
    # Check for duplicates
    duplicates = df.duplicated(subset=['provider_ccn', 'apc_code']).sum()
    if duplicates > 0:
        print(f"Warning: Found {duplicates} duplicate rows based on provider_ccn and apc_code.")
    
    # Check for invalid values (e.g., negative counts or payments)
    for col in numeric_cols:
        if (df[col] < 0).any():
            print(f"Warning: Negative values found in {col}.")
    for col in financial_cols:
        if (df[col] < 0).any():
            print(f"Warning: Negative values found in {col}.")
    
    # Step 5: Save cleaned dataset
    df.to_csv(out_csv, index=False)
    print(f"\nCleaned dataset saved to {out_csv}")
    print(f"Rows: {len(df)}, Columns: {list(df.columns)}")
    print("\nFirst 3 rows preview:")
    print(df.head(3).to_string(index=False))
    
    return df

# Usage: Assuming df is loaded from fetch_medicare_outpatient_by_provider_apc
df = fetch_medicare_outpatient_by_provider_apc()  # Run fetch first
df_cleaned = clean_medicare_outpatient_by_provider_apc(df)

Fetching API: https://data.cms.gov/data-api/v1/dataset/ccbc9a44-40d4-46b4-a709-5caa59212e50/data
Parsed 1000 rows with columns: ['Rndrng_Prvdr_CCN', 'Rndrng_Prvdr_Org_Name', 'Rndrng_Prvdr_St', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_RUCA', 'Rndrng_Prvdr_RUCA_Desc', 'APC_Cd', 'APC_Desc', 'Bene_Cnt', 'CAPC_Srvcs', 'Avg_Tot_Sbmtd_Chrgs', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Outlier_Srvcs', 'Avg_Mdcr_Outlier_Amt']

First 3 rows preview:
Rndrng_Prvdr_CCN           Rndrng_Prvdr_Org_Name        Rndrng_Prvdr_St Rndrng_Prvdr_City Rndrng_Prvdr_State_Abrvtn Rndrng_Prvdr_State_FIPS Rndrng_Prvdr_Zip5 Rndrng_Prvdr_RUCA                                                                               Rndrng_Prvdr_RUCA_Desc APC_Cd                                                APC_Desc Bene_Cnt CAPC_Srvcs Avg_Tot_Sbmtd_Chrgs Avg_Mdcr_Alowd_Amt Avg_Mdcr_Pymt_Amt Outlier_Srvcs Avg_Mdcr_Outlier_Amt
          010001 Southeast Healt