# Hospital Financial Reports Explorer (Parquet Version)

This notebook queries CMS HCRIS worksheet data directly from **parquet files** in `data/worksheets/`.

## Data Source
- **Source**: Hive-partitioned parquet files by `state_code` and `fiscal_year`
- **Location**: `data/worksheets/{worksheet_code}/state_code=XX/fiscal_year=YYYY/*.parquet`
- **Worksheets**: 25 worksheets (A, B, C, G, S series)

## Available Reports
1. **Balance Sheet** (G000000) - Assets, liabilities, and equity
2. **Income Statement** (G300000) - Revenues and expenses
3. **Patient Revenue Statement** (G200000) - Revenue by payer and service
4. **Fund Balance Changes** (G100000) - Changes in fund balances
5. **Cost Allocation Summary** (B100000) - Allocated costs by cost center
6. **Utilization Statistics** (S300001) - Beds, days, admissions, visits

---

In [None]:
# Import required libraries
import duckdb
import pandas as pd
import numpy as np
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

# Set display options: show up to 100 rows, never truncate columns or wrap
# on terminal width, and render floats with two decimals.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# The check mark was mis-encoded (UTF-8 bytes read as Mac Roman); restored.
print("✓ Libraries imported successfully")

In [None]:
# Configuration
# Paths are relative to the notebook's working directory; the worksheet
# parquet files are expected under data/worksheets/.
BASE_DIR = Path('.')
WORKSHEETS_DIR = BASE_DIR / 'data' / 'worksheets'

# Status output (check marks were mis-encoded in the original strings).
print(f"✓ Base directory: {BASE_DIR.absolute()}")
print(f"✓ Worksheets directory: {WORKSHEETS_DIR.absolute()}")
print(f"✓ Directory exists: {WORKSHEETS_DIR.exists()}")

In [None]:
# Get available hospitals and years from parquet files
# Use G000000 (Balance Sheet) as reference for available providers
parquet_pattern = str(WORKSHEETS_DIR / 'g000000' / '**' / '*.parquet')

con = duckdb.connect(':memory:')
try:
    # Distinct providers together with the state they are partitioned under
    providers_df = con.execute(f"""
        SELECT DISTINCT
            Provider_Number,
            state_code
        FROM read_parquet('{parquet_pattern}', hive_partitioning=1)
        ORDER BY state_code, Provider_Number
    """).df()

    # Available fiscal years, newest first
    years_df = con.execute(f"""
        SELECT DISTINCT fiscal_year
        FROM read_parquet('{parquet_pattern}', hive_partitioning=1)
        ORDER BY fiscal_year DESC
    """).df()
finally:
    # Release the connection even when a query raises (for example when the
    # glob matches no parquet files); the original only closed it on success.
    con.close()

print(f"✓ Found {len(providers_df)} hospitals")
print(f"✓ States: {', '.join(sorted(providers_df['state_code'].unique()))}")
print(f"✓ Fiscal years: {', '.join(map(str, years_df['fiscal_year'].tolist()))}")

## Hospital Selection

Select a hospital and fiscal year to view financial reports.

In [None]:
# Create selection widgets
# Each option is a (label, value) pair: the label shows the provider number
# with its state, the value is the bare provider number. Column lists are
# zipped instead of using iterrows(), which builds a Series per row.
provider_options = [
    (f"{provider} ({state})", provider)
    for provider, state in zip(
        providers_df['Provider_Number'].tolist(),
        providers_df['state_code'].tolist(),
    )
]

provider_dropdown = widgets.Dropdown(
    options=provider_options,
    description='Hospital:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

year_dropdown = widgets.Dropdown(
    options=[(str(year), year) for year in years_df['fiscal_year'].tolist()],
    description='Fiscal Year:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='200px')
)

display(widgets.HBox([provider_dropdown, year_dropdown]))
# The pointing-hand emoji was mis-encoded in the original string; restored.
print("\n👆 Select hospital and fiscal year above")

## Helper Functions

In [None]:
def format_millions(value):
    """Scale a raw amount to millions, rounded to two decimals.

    Missing values (anything ``pd.isna`` reports as null) and zero both
    come back as ``0.00``.
    """
    is_missing = pd.isna(value)
    if not is_missing and value != 0:
        return round(value / 1e6, 2)
    return 0.00

def style_dataframe(df):
    """Apply styling to dataframe for better readability"""
    return df.style.format({
        col: '{:,.2f}' for col in df.select_dtypes(include=[np.number]).columns
    }).set_properties(**{
        'text-align': 'right'
    }, subset=df.select_dtypes(include=[np.number]).columns).set_properties(**{
        'text-align': 'left'
    }, subset=df.select_dtypes(include=['object']).columns)

def get_worksheet_path(worksheet_code):
    """Return the recursive parquet glob for one worksheet.

    The worksheet code is lower-cased to form the directory name under
    ``WORKSHEETS_DIR``; the ``**`` segment spans the partition directories.
    """
    worksheet_dir = WORKSHEETS_DIR / worksheet_code.lower()
    return str(worksheet_dir / '**' / '*.parquet')

# The check mark was mis-encoded in the original string; restored.
print("✓ Helper functions defined")

---

## 1️⃣ Balance Sheet (G000000)

Complete hospital balance sheet showing assets, liabilities, and equity at fiscal year end.

In [None]:
def get_balance_sheet(provider_num, year):
    """Get the balance sheet (worksheet G000000) for one hospital and year.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A DataFrame with one row per (Line, Category, Subcategory) and one
        ``"<Time_Period> ($M)"`` column per distinct column label, with
        amounts in millions. Returns ``None`` (after printing a warning)
        when the query yields no rows.
    """
    parquet_path = get_worksheet_path('g000000')

    con = duckdb.connect(':memory:')
    try:
        df = con.execute(f"""
            SELECT
                Line,
                line_level1 as Category,
                line_level2 as Subcategory,
                "Column",
                col_level1 as Time_Period,
                Value
            FROM read_parquet('{parquet_path}', hive_partitioning=1)
            WHERE Provider_Number = ?
                AND fiscal_year = ?
            ORDER BY Line, "Column"
        """, [provider_num, int(year)]).df()
    finally:
        # Close the connection even when the query raises
        con.close()

    if df.empty:
        print(f"⚠️ No balance sheet data found for Provider {provider_num}, Year {year}")
        return None

    # pivot_table groups on the index/column keys and drops rows whose key
    # is null, so blank out missing labels and fall back to the raw column
    # code when a column has no descriptive label.
    label_cols = ['Category', 'Subcategory']
    df[label_cols] = df[label_cols].astype(object).fillna('')
    df['Time_Period'] = df['Time_Period'].fillna(df['Column'])

    # Pivot to show the column labels side by side
    index_cols = ['Line'] + label_cols
    pivot_df = df.pivot_table(
        index=index_cols,
        columns='Time_Period',
        values='Value',
        aggfunc='first'
    ).reset_index()

    # Convert to millions -- only the pivoted value columns, so a numeric
    # index column (e.g. Line) can never be rescaled by mistake.
    value_cols = [col for col in pivot_df.columns if col not in index_cols]
    for col in pivot_df[value_cols].select_dtypes(include=[np.number]).columns:
        pivot_df[f'{col} ($M)'] = pivot_df[col].apply(format_millions)
        pivot_df = pivot_df.drop(col, axis=1)

    return pivot_df

# Get and display the balance sheet for the current widget selection
balance_sheet = get_balance_sheet(provider_dropdown.value, year_dropdown.value)

if balance_sheet is not None:
    # Emoji restored: the original string literal was mis-encoded
    print("\n📊 Balance Sheet (G000000)")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}")
    print("Note: All amounts in millions (USD)")
    print(f"Total line items: {len(balance_sheet):,}\n")
    display(style_dataframe(balance_sheet))

---

## 2️⃣ Income Statement (G300000)

Statement of revenues and expenses for the fiscal year.

In [None]:
def get_income_statement(provider_num, year):
    """Get the income statement (worksheet G300000) for one hospital and year.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A DataFrame with columns Line, Category, Account and ``Value ($M)``
        (amounts in millions), ordered by Line. Returns ``None`` (after
        printing a warning) when the query yields no rows.
    """
    parquet_path = get_worksheet_path('g300000')

    con = duckdb.connect(':memory:')
    try:
        df = con.execute(f"""
            SELECT
                Line,
                line_level1 as Category,
                line_level2 as Account,
                Value
            FROM read_parquet('{parquet_path}', hive_partitioning=1)
            WHERE Provider_Number = ?
                AND fiscal_year = ?
            ORDER BY Line
        """, [provider_num, int(year)]).df()
    finally:
        # Close the connection even when the query raises
        con.close()

    if df.empty:
        print(f"⚠️ No income statement data found for Provider {provider_num}, Year {year}")
        return None

    # Convert to millions and drop the raw amount
    df['Value ($M)'] = df['Value'].apply(format_millions)
    return df.drop('Value', axis=1)

# Get and display the income statement for the current widget selection
income_statement = get_income_statement(provider_dropdown.value, year_dropdown.value)

if income_statement is not None:
    # Emoji restored: the original string literal was mis-encoded
    print("\n💰 Income Statement (G300000)")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}")
    print("Note: All amounts in millions (USD)")
    print(f"Total line items: {len(income_statement):,}\n")
    display(style_dataframe(income_statement))

---

## 3️⃣ Patient Revenue Statement (G200000)

Patient revenue breakdown by payer source and service type.

In [None]:
def get_patient_revenue(provider_num, year):
    """Get the patient revenue statement (worksheet G200000) for one hospital and year.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A DataFrame with one row per (Line, Category, Subcategory) and one
        ``"<Payer_Type> ($M)"`` column per distinct column label, with
        amounts in millions. Returns ``None`` (after printing a warning)
        when the query yields no rows.
    """
    parquet_path = get_worksheet_path('g200000')

    con = duckdb.connect(':memory:')
    try:
        df = con.execute(f"""
            SELECT
                Line,
                line_level1 as Category,
                line_level2 as Subcategory,
                "Column",
                col_level1 as Payer_Type,
                Value
            FROM read_parquet('{parquet_path}', hive_partitioning=1)
            WHERE Provider_Number = ?
                AND fiscal_year = ?
            ORDER BY Line, "Column"
        """, [provider_num, int(year)]).df()
    finally:
        # Close the connection even when the query raises
        con.close()

    if df.empty:
        print(f"⚠️ No patient revenue data found for Provider {provider_num}, Year {year}")
        return None

    # pivot_table groups on the index/column keys and drops rows whose key
    # is null, so blank out missing labels and fall back to the raw column
    # code when a column has no descriptive label.
    label_cols = ['Category', 'Subcategory']
    df[label_cols] = df[label_cols].astype(object).fillna('')
    df['Payer_Type'] = df['Payer_Type'].fillna(df['Column'])

    # Pivot to show the column labels side by side
    index_cols = ['Line'] + label_cols
    pivot_df = df.pivot_table(
        index=index_cols,
        columns='Payer_Type',
        values='Value',
        aggfunc='first'
    ).reset_index()

    # Convert to millions -- only the pivoted value columns, so a numeric
    # index column (e.g. Line) can never be rescaled by mistake.
    value_cols = [col for col in pivot_df.columns if col not in index_cols]
    for col in pivot_df[value_cols].select_dtypes(include=[np.number]).columns:
        pivot_df[f'{col} ($M)'] = pivot_df[col].apply(format_millions)
        pivot_df = pivot_df.drop(col, axis=1)

    return pivot_df

# Get and display the patient revenue statement for the current widget selection
patient_revenue = get_patient_revenue(provider_dropdown.value, year_dropdown.value)

if patient_revenue is not None:
    # Emoji restored: the original string literal was mis-encoded
    print("\n📈 Patient Revenue Statement (G200000)")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}")
    print("Note: All amounts in millions (USD)")
    print(f"Total line items: {len(patient_revenue):,}\n")
    display(style_dataframe(patient_revenue))

---

## 4️⃣ Fund Balance Changes (G100000)

Statement showing changes in fund balances during the fiscal year.

In [None]:
def get_fund_balance_changes(provider_num, year):
    """Get fund balance changes (worksheet G100000) for one hospital and year.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A DataFrame with one row per (Line, Category, Account) and one
        ``"<Fund_Type> ($M)"`` column per distinct column label, with
        amounts in millions. Returns ``None`` (after printing a warning)
        when the query yields no rows.
    """
    parquet_path = get_worksheet_path('g100000')

    con = duckdb.connect(':memory:')
    try:
        df = con.execute(f"""
            SELECT
                Line,
                line_level1 as Category,
                line_level2 as Account,
                "Column",
                col_level1 as Fund_Type,
                Value
            FROM read_parquet('{parquet_path}', hive_partitioning=1)
            WHERE Provider_Number = ?
                AND fiscal_year = ?
            ORDER BY Line, "Column"
        """, [provider_num, int(year)]).df()
    finally:
        # Close the connection even when the query raises
        con.close()

    if df.empty:
        print(f"⚠️ No fund balance changes data found for Provider {provider_num}, Year {year}")
        return None

    # pivot_table groups on the index/column keys and drops rows whose key
    # is null, so blank out missing labels and fall back to the raw column
    # code when a column has no descriptive label.
    label_cols = ['Category', 'Account']
    df[label_cols] = df[label_cols].astype(object).fillna('')
    df['Fund_Type'] = df['Fund_Type'].fillna(df['Column'])

    # Pivot to show the column labels side by side
    index_cols = ['Line'] + label_cols
    pivot_df = df.pivot_table(
        index=index_cols,
        columns='Fund_Type',
        values='Value',
        aggfunc='first'
    ).reset_index()

    # Convert to millions -- only the pivoted value columns, so a numeric
    # index column (e.g. Line) can never be rescaled by mistake.
    value_cols = [col for col in pivot_df.columns if col not in index_cols]
    for col in pivot_df[value_cols].select_dtypes(include=[np.number]).columns:
        pivot_df[f'{col} ($M)'] = pivot_df[col].apply(format_millions)
        pivot_df = pivot_df.drop(col, axis=1)

    return pivot_df

# Get and display fund balance changes for the current widget selection
fund_changes = get_fund_balance_changes(provider_dropdown.value, year_dropdown.value)

if fund_changes is not None:
    # Emoji restored: the original string literal was mis-encoded
    print("\n💼 Fund Balance Changes (G100000)")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}")
    print("Note: All amounts in millions (USD)")
    print(f"Total line items: {len(fund_changes):,}\n")
    display(style_dataframe(fund_changes))

---

## 5️⃣ Cost Allocation Summary (B100000)

Total allocated costs by cost center after stepdown allocation.

In [None]:
def get_cost_allocation(provider_num, year):
    """Get the cost allocation summary (worksheet B100000) for one hospital and year.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A DataFrame with one row per (Line, Cost_Center_Type, Cost_Center)
        and one ``"<Cost_Component> ($M)"`` column per distinct column
        label, with amounts in millions. Returns ``None`` (after printing a
        warning) when the query yields no rows.
    """
    parquet_path = get_worksheet_path('b100000')

    con = duckdb.connect(':memory:')
    try:
        df = con.execute(f"""
            SELECT
                Line,
                line_level1 as Cost_Center_Type,
                line_level2 as Cost_Center,
                "Column",
                col_level1 as Cost_Component,
                Value
            FROM read_parquet('{parquet_path}', hive_partitioning=1)
            WHERE Provider_Number = ?
                AND fiscal_year = ?
            ORDER BY Line, "Column"
        """, [provider_num, int(year)]).df()
    finally:
        # Close the connection even when the query raises
        con.close()

    if df.empty:
        print(f"⚠️ No cost allocation data found for Provider {provider_num}, Year {year}")
        return None

    # pivot_table groups on the index/column keys and drops rows whose key
    # is null, so blank out missing labels and fall back to the raw column
    # code when a column has no descriptive label.
    label_cols = ['Cost_Center_Type', 'Cost_Center']
    df[label_cols] = df[label_cols].astype(object).fillna('')
    df['Cost_Component'] = df['Cost_Component'].fillna(df['Column'])

    # Pivot to show cost components as columns
    index_cols = ['Line'] + label_cols
    pivot_df = df.pivot_table(
        index=index_cols,
        columns='Cost_Component',
        values='Value',
        aggfunc='first'
    ).reset_index()

    # Convert to millions -- only the pivoted value columns, so a numeric
    # index column (e.g. Line) can never be rescaled by mistake.
    value_cols = [col for col in pivot_df.columns if col not in index_cols]
    for col in pivot_df[value_cols].select_dtypes(include=[np.number]).columns:
        pivot_df[f'{col} ($M)'] = pivot_df[col].apply(format_millions)
        pivot_df = pivot_df.drop(col, axis=1)

    return pivot_df

# Get and display the cost allocation summary for the current widget selection
cost_allocation = get_cost_allocation(provider_dropdown.value, year_dropdown.value)

if cost_allocation is not None:
    # Emoji restored: the original string literal was mis-encoded
    print("\n💸 Cost Allocation Summary (B100000)")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}")
    print("Note: All amounts in millions (USD)")
    print(f"Total cost centers: {len(cost_allocation):,}\n")
    display(style_dataframe(cost_allocation))

---

## 6️⃣ Utilization Statistics (S300001)

Hospital utilization metrics including beds, patient days, admissions, and visits.

In [None]:
def get_utilization_stats(provider_num, year):
    """Get utilization statistics (worksheet S300001) for one hospital and year.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A DataFrame with one row per (Line, Category, Metric) and one column
        per distinct column label, with numeric values rounded to whole
        numbers (no millions conversion). Returns ``None`` (after printing a
        warning) when the query yields no rows.
    """
    parquet_path = get_worksheet_path('s300001')

    con = duckdb.connect(':memory:')
    try:
        df = con.execute(f"""
            SELECT
                Line,
                line_level1 as Category,
                line_level2 as Metric,
                "Column",
                col_level1 as Service_Type,
                Value
            FROM read_parquet('{parquet_path}', hive_partitioning=1)
            WHERE Provider_Number = ?
                AND fiscal_year = ?
            ORDER BY Line, "Column"
        """, [provider_num, int(year)]).df()
    finally:
        # Close the connection even when the query raises
        con.close()

    if df.empty:
        print(f"⚠️ No utilization statistics found for Provider {provider_num}, Year {year}")
        return None

    # pivot_table groups on the index/column keys and drops rows whose key
    # is null, so blank out missing labels and fall back to the raw column
    # code when a column has no descriptive label.
    label_cols = ['Category', 'Metric']
    df[label_cols] = df[label_cols].astype(object).fillna('')
    df['Service_Type'] = df['Service_Type'].fillna(df['Column'])

    # Pivot to show service types as columns
    index_cols = ['Line'] + label_cols
    pivot_df = df.pivot_table(
        index=index_cols,
        columns='Service_Type',
        values='Value',
        aggfunc='first'
    ).reset_index()

    # Format numbers (no millions conversion for counts) -- value columns
    # only, so a numeric index column is left untouched.
    value_cols = [col for col in pivot_df.columns if col not in index_cols]
    for col in pivot_df[value_cols].select_dtypes(include=[np.number]).columns:
        pivot_df[col] = pivot_df[col].round(0)

    return pivot_df

# Get and display utilization statistics for the current widget selection
utilization = get_utilization_stats(provider_dropdown.value, year_dropdown.value)

if utilization is not None:
    # Emoji restored: the original string literal was mis-encoded
    print("\n🏥 Utilization Statistics (S300001)")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}")
    print("Note: Values are counts/days (not in millions)")
    print(f"Total metrics: {len(utilization):,}\n")
    display(style_dataframe(utilization))

---

## 7️⃣ Key Performance Indicators (KPIs)

Calculate financial and operational KPIs from multiple worksheets.

In [None]:
def calculate_kpis(provider_num, year):
    """Calculate key performance indicators for a hospital.

    Reads worksheets G000000, G300000 and S300001 for the given provider and
    fiscal year and derives headline amounts, ratios and utilization metrics.

    Args:
        provider_num: Provider number matched against ``Provider_Number``.
        year: Fiscal year; coerced with ``int()`` before being bound.

    Returns:
        A dict mapping KPI label to value. Any exception is caught and
        printed, so the dict may be empty or only partially filled when a
        query or calculation fails.
    """
    con = duckdb.connect(':memory:')

    kpis = {}

    try:
        query_params = [provider_num, int(year)]

        # Get Balance Sheet data (G000000)
        g000_path = get_worksheet_path('g000000')
        balance_sheet = con.execute(f"""
            SELECT Line, "Column", Value
            FROM read_parquet('{g000_path}', hive_partitioning=1)
            WHERE Provider_Number = ? AND fiscal_year = ?
        """, query_params).df()

        # Get Income Statement data (G300000)
        g300_path = get_worksheet_path('g300000')
        income_stmt = con.execute(f"""
            SELECT Line, Value
            FROM read_parquet('{g300_path}', hive_partitioning=1)
            WHERE Provider_Number = ? AND fiscal_year = ?
        """, query_params).df()

        # Get Utilization data (S300001)
        s300_path = get_worksheet_path('s300001')
        utilization = con.execute(f"""
            SELECT Line, "Column", Value
            FROM read_parquet('{s300_path}', hive_partitioning=1)
            WHERE Provider_Number = ? AND fiscal_year = ?
        """, query_params).df()

        def get_value(df, line, column=None):
            """Return the first Value for a line (and column), or 0 if absent or null."""
            mask = df['Line'] == line
            if column:
                mask = mask & (df['Column'] == column)
            values = df.loc[mask, 'Value']
            if values.empty:
                return 0
            value = values.iloc[0]
            # A null cell would make int() below raise part-way through and
            # leave a half-filled KPI dict, so treat it like a missing row.
            return 0 if pd.isna(value) else value

        # NOTE(review): the line/column codes below are hard-coded; confirm
        # them against the worksheet layout used to build the parquet files.

        # Balance Sheet KPIs
        total_assets_end = get_value(balance_sheet, '03000', '00100')  # Total Assets - End of Year
        current_assets = get_value(balance_sheet, '00300', '00100')  # Current Assets - End
        current_liabilities = get_value(balance_sheet, '04300', '00100')  # Current Liabilities - End
        total_liabilities = get_value(balance_sheet, '05000', '00100')  # Total Liabilities - End

        # Income Statement KPIs
        total_revenue = get_value(income_stmt, '00300')  # Total Operating Revenue
        total_expenses = get_value(income_stmt, '01300')  # Total Operating Expenses
        net_income = get_value(income_stmt, '02900')  # Net Income

        # Utilization KPIs
        beds_available = get_value(utilization, '01400', '00100')  # Beds Available
        patient_days = get_value(utilization, '01500', '00100')  # Total Patient Days
        admissions = get_value(utilization, '02000', '00100')  # Total Admissions

        # Headline amounts
        kpis['Total Assets ($M)'] = format_millions(total_assets_end)
        kpis['Total Revenue ($M)'] = format_millions(total_revenue)
        kpis['Total Expenses ($M)'] = format_millions(total_expenses)
        kpis['Net Income ($M)'] = format_millions(net_income)

        # Financial Ratios (each falls back to 0 when its denominator is 0)
        # NOTE(review): 'Operating Margin (%)' is computed from net income,
        # not operating income -- confirm the intended definition.
        kpis['Operating Margin (%)'] = round((net_income / total_revenue * 100), 2) if total_revenue else 0
        kpis['Current Ratio'] = round(current_assets / current_liabilities, 2) if current_liabilities else 0
        kpis['Debt-to-Asset Ratio'] = round(total_liabilities / total_assets_end, 2) if total_assets_end else 0

        # Utilization Metrics
        # NOTE(review): a 365-day reporting period is assumed; confirm for
        # leap years and short cost-report periods.
        kpis['Beds Available'] = int(beds_available)
        kpis['Patient Days'] = int(patient_days)
        kpis['Admissions'] = int(admissions)
        kpis['Average Daily Census'] = round(patient_days / 365, 1) if patient_days else 0
        kpis['Occupancy Rate (%)'] = round((patient_days / (beds_available * 365) * 100), 1) if beds_available else 0
        kpis['Average Length of Stay'] = round(patient_days / admissions, 1) if admissions else 0

        # Per-Unit Metrics
        kpis['Revenue per Patient Day ($)'] = round(total_revenue / patient_days, 2) if patient_days else 0
        kpis['Cost per Patient Day ($)'] = round(total_expenses / patient_days, 2) if patient_days else 0

    except Exception as e:
        # Best effort: report the problem and return whatever was computed
        print(f"⚠️ Error calculating KPIs: {str(e)}")

    finally:
        con.close()

    return kpis

# Calculate and display KPIs for the current widget selection
kpis = calculate_kpis(provider_dropdown.value, year_dropdown.value)

if kpis:
    # Emoji restored: the original string literal was mis-encoded
    print("\n📊 Key Performance Indicators")
    print(f"Provider: {provider_dropdown.value} | Fiscal Year: {year_dropdown.value}\n")

    # One row per KPI: label in 'Metric', number in 'Value'
    kpis_df = pd.DataFrame(list(kpis.items()), columns=['Metric', 'Value'])
    display(kpis_df.style.set_properties(**{'text-align': 'left'}))

---

## 8️⃣ Export to Excel

Export all financial reports to a single Excel file with multiple sheets.

In [None]:
def export_to_excel(provider_num, year):
    """Export all financial reports to a single Excel workbook.

    Each report that returns data gets its own sheet, followed by a KPI
    sheet when any KPIs could be calculated.

    Args:
        provider_num: Provider number passed to every report function.
        year: Fiscal year passed to every report function.

    Returns:
        The name of the written ``.xlsx`` file, or ``None`` (after printing
        a warning) when no report produced data and nothing was written.
    """
    filename = f"Financial_Reports_{provider_num}_FY{year}_Parquet.xlsx"

    # (sheet name, report function) pairs, in workbook order
    report_builders = [
        ('Balance Sheet', get_balance_sheet),
        ('Income Statement', get_income_statement),
        ('Patient Revenue', get_patient_revenue),
        ('Fund Balance Changes', get_fund_balance_changes),
        ('Cost Allocation', get_cost_allocation),
        ('Utilization', get_utilization_stats),
    ]

    # Collect everything first so the workbook is only opened when there is
    # at least one sheet to write.
    sheets = {}
    for sheet_name, build_report in report_builders:
        report_df = build_report(provider_num, year)
        if report_df is not None:
            sheets[sheet_name] = report_df

    kpis = calculate_kpis(provider_num, year)
    if kpis:
        sheets['KPIs'] = pd.DataFrame(list(kpis.items()), columns=['Metric', 'Value'])

    if not sheets:
        # A workbook without any sheet cannot be saved; skip the export
        # instead of failing inside the writer and leaving a broken file.
        print(f"⚠️ No data to export for Provider {provider_num}, Year {year}")
        return None

    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        for sheet_name, sheet_df in sheets.items():
            sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"✓ Exported to {filename}")
    return filename

# Uncomment to export:
# export_to_excel(provider_dropdown.value, year_dropdown.value)