# Data Cleaning for HCAHPS

In [None]:
import pandas as pd
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Link to the dataset's data dictionary on data.cms.gov; kept for reference
# (not used by any code visible in this notebook).
url = 'https://data.cms.gov/provider-data/dataset/dgck-syfz#data-dictionary'

# Load the HCAHPS hospital CSV; the path is relative to the current working directory.
df = pd.read_csv('../hcahps/data/HCAHPS-Hospital.csv')
# Bare expression: as the last line of a notebook cell this displays the loaded frame.
df

In [None]:
# List the distinct values of the 'HCAHPS Question' column; these are the strings
# that the functions below compare against when selecting a measure.
df['HCAHPS Question'].unique()

In [None]:
def get_hospital_comparison(df, facility_name, measure_type):
    """
    Compare a specific hospital's performance against state and national averages.

    Args:
    df: DataFrame containing HCAHPS data. Must provide the columns
        'HCAHPS Question', 'HCAHPS Answer Percent', 'Facility Name', 'State'
        and 'Number of Completed Surveys'.
    facility_name: Name of the hospital to analyze. If several rows share the
        name, the first matching row for the measure is used.
    measure_type: Exact 'HCAHPS Question' value to analyze.

    Returns:
    dict with keys 'hospital_name', 'hospital_score', 'state_average',
    'national_average', 'percentile_rank' and 'completed_surveys'.
    'percentile_rank' is the percentage of hospitals with a valid score that
    is strictly lower than this hospital's score; it is NaN when the
    hospital's own score is unavailable.

    Raises:
    IndexError: if no row exists for the given facility and measure.
    """
    score_col = 'HCAHPS Answer Percent'

    # Filter for the specific measure (copy so the caller's frame is not modified)
    measure_df = df[df['HCAHPS Question'] == measure_type].copy()

    # Non-numeric entries such as 'Not Available' become NaN
    measure_df[score_col] = pd.to_numeric(measure_df[score_col], errors='coerce')

    # Get hospital's data; fail with a message that names what was missing
    # (IndexError is kept because that is what the bare .iloc[0] used to raise)
    hospital_rows = measure_df[measure_df['Facility Name'] == facility_name]
    if hospital_rows.empty:
        raise IndexError(
            f"No rows found for facility {facility_name!r} and measure {measure_type!r}"
        )
    hospital = hospital_rows.iloc[0]
    hospital_score = hospital[score_col]

    # State and national averages (mean() skips NaN scores)
    in_state = measure_df['State'] == hospital['State']
    state_avg = measure_df.loc[in_state, score_col].mean()
    national_avg = measure_df[score_col].mean()

    # Percentile rank among hospitals with a valid score. Comparing against a
    # NaN hospital score is always False and would report a misleading 0.0,
    # so return NaN explicitly in that case.
    if pd.isna(hospital_score):
        percentile = float('nan')
    else:
        valid_scores = measure_df[score_col].dropna()
        percentile = (valid_scores < hospital_score).mean() * 100

    return {
        'hospital_name': facility_name,
        'hospital_score': hospital_score,
        'state_average': state_avg,
        'national_average': national_avg,
        'percentile_rank': percentile,
        'completed_surveys': hospital['Number of Completed Surveys'],
    }

def visualize_trends(df, state=None):
    """
    Plot a histogram of nurse communication scores, optionally for one state.

    Args:
    df: DataFrame containing HCAHPS data with the columns 'HCAHPS Question',
        'HCAHPS Answer Percent' and 'State'.
    state: Optional value of the 'State' column to restrict the plot to.
        When falsy, all rows are used.

    Returns:
    The matplotlib.pyplot module, with the histogram drawn on a new current figure.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Filter for nurse communication measure.
    # NOTE(review): the pattern matches every question containing
    # "nurses ... communicated", which may be more than the single "Always"
    # measure the x-axis label describes - confirm against the question list.
    nurse_comm = df[df['HCAHPS Question'].str.contains('nurses.*communicated', case=False, na=False)]

    if state:
        nurse_comm = nurse_comm[nurse_comm['State'] == state]

    # Coerce scores to numbers so non-numeric entries such as 'Not Available'
    # become NaN instead of reaching the histogram as strings. assign()
    # returns a new frame, so the caller's data is left untouched.
    nurse_comm = nurse_comm.assign(**{
        'HCAHPS Answer Percent': pd.to_numeric(nurse_comm['HCAHPS Answer Percent'], errors='coerce')
    })

    # Create distribution plot
    plt.figure(figsize=(10, 6))
    sns.histplot(data=nurse_comm, x='HCAHPS Answer Percent', bins=30)
    plt.title(f'Distribution of Nurse Communication Scores {"in " + state if state else "Nationally"}')
    plt.xlabel('Always Communicated Well (%)')
    plt.ylabel('Number of Hospitals')

    return plt

In [None]:
# Example: compare one hospital on the "would definitely recommend" measure.
# The measure string must match a value of the 'HCAHPS Question' column exactly.
get_hospital_comparison(df, 'RUSH UNIVERSITY MEDICAL CENTER', 'Patients who reported YES, they would definitely recommend the hospital')

# Let's find out which areas a hospital performs worst in

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import textwrap

def _wrap_metric_label(text, num_hospitals, width=40):
    """Wrap a question to ``width`` characters and append the hospital count."""
    wrapped = textwrap.fill(text, width=width, break_long_words=False, break_on_hyphens=False)
    return f"{wrapped}\n({num_hospitals} hospitals)"


def _plot_metric_panel(ax, metrics, facility_name, hospital_color, title):
    """Draw paired hospital / national-average horizontal bars for ``metrics`` on ``ax``."""
    bar_height = 0.35
    # Size the panel from the data: there may be fewer than five metrics.
    y_pos = list(range(len(metrics)))

    hospital_bars = ax.barh([p + bar_height for p in y_pos],
                            metrics['score'],
                            height=bar_height,
                            color=hospital_color,
                            alpha=0.6,
                            label=facility_name)

    natl_bars = ax.barh(y_pos,
                        metrics['national_avg'],
                        height=bar_height,
                        color='blue',
                        alpha=0.3,
                        label='National Average')

    # Put each label midway between the two bars of its pair
    ax.set_yticks([p + bar_height / 2 for p in y_pos])
    ax.set_yticklabels([_wrap_metric_label(q, n) for q, n in
                        zip(metrics['question'], metrics['num_hospitals'])],
                       fontsize=8)

    ax.set_xlabel('Score (%)')
    ax.set_title(title)
    ax.legend(loc='lower right')

    # Add score labels at the end of every bar
    for bars in (hospital_bars, natl_bars):
        for bar in bars:
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height() / 2,
                    f'{width:.1f}%',
                    ha='left', va='center',
                    fontsize=8)


def analyze_hospital_performance(df, facility_name):
    """
    Analyzes a hospital's performance across all HCAHPS questions and visualizes
    the top 5 and bottom 5 performing areas with national average comparisons.

    Questions are ranked by the hospital's percentile, i.e. the percentage of
    hospitals with a valid score strictly below this hospital's score.
    NOTE(review): the ranking treats a higher answer percent as better for
    every question - confirm that holds for all question wordings in the data.

    Args:
    df (pd.DataFrame): DataFrame containing HCAHPS data with the columns
        'HCAHPS Question', 'HCAHPS Answer Percent' and 'Facility Name'.
        It is copied, not modified.
    facility_name (str): Name of the hospital to analyze

    Returns:
    dict: 'top_performers' and 'bottom_performers', each a list of up to five
    records with keys 'question', 'score', 'national_avg', 'percentile' and
    'num_hospitals'. With fewer than ten ranked questions the two lists
    overlap. A figure with both panels is shown as a side effect.

    Raises:
    ValueError: if the hospital has no question with a valid score.
    """
    # Create a copy of the dataframe
    df = df.copy()

    # Convert percentages to numeric, handling 'Not Available'
    df['HCAHPS Answer Percent'] = pd.to_numeric(df['HCAHPS Answer Percent'], errors='coerce')

    # One record per question the hospital has a usable score for
    results = []

    # Calculate metrics for each question
    for question in df['HCAHPS Question'].unique():
        question_df = df[df['HCAHPS Question'] == question]

        try:
            # Get hospital's score (first matching row)
            hospital_data = question_df[question_df['Facility Name'] == facility_name].iloc[0]
            hospital_score = hospital_data['HCAHPS Answer Percent']
        except (IndexError, KeyError):
            # Hospital has no row for this question
            continue

        # Skip if hospital score is NaN
        if pd.isna(hospital_score):
            continue

        # Get valid scores for this question (excluding NaN)
        valid_scores = question_df['HCAHPS Answer Percent'].dropna()

        # Skip if not enough valid scores for comparison
        if len(valid_scores) < 2:
            continue

        results.append({
            'question': question,
            'score': hospital_score,
            'national_avg': valid_scores.mean(),
            'percentile': (valid_scores < hospital_score).mean() * 100,
            'num_hospitals': len(valid_scores)
        })

    # Ensure we have valid results
    if not results:
        raise ValueError(f"No valid metrics found for {facility_name}")

    # Best percentile first
    results_df = pd.DataFrame(results).sort_values('percentile', ascending=False)

    # Get top 5 and bottom 5 (fewer when fewer questions were ranked)
    top_5 = results_df.head(5)
    bottom_5 = results_df.tail(5)

    # Create visualization: one panel per group, side by side
    _, (ax_top, ax_bottom) = plt.subplots(1, 2, figsize=(15, 10))
    _plot_metric_panel(ax_top, top_5, facility_name, 'green', 'Top 5 Performing Metrics')
    _plot_metric_panel(ax_bottom, bottom_5, facility_name, 'red', 'Bottom 5 Performing Metrics')

    plt.tight_layout()
    plt.show()

    return {
        'top_performers': top_5.to_dict('records'),
        'bottom_performers': bottom_5.to_dict('records')
    }

In [None]:
# Example: rank every question for one hospital, show the top/bottom chart,
# and display the returned dict as the cell output.
analyze_hospital_performance(df, 'RUSH UNIVERSITY MEDICAL CENTER')