In [63]:
import polars as pl

# Load the data
facility_df = pl.read_parquet('../../003_data/001_raw-data/2017-2024_national_cms_dialysis-facility_data.parquet')
cahps_df = pl.read_parquet('../../003_data/001_raw-data/2017-2024_national_cms_dialysis-facility_cahps-data.parquet')

# Display the first few rows of the cahps data.
# Jupyter only renders a bare expression when it is the last line of a cell,
# so the preview is shown explicitly with display() instead.
display(cahps_df.head())

# Filter the data for California
# NOTE(review): this filters on 'state' only. A later cell treats 'state_' as
# an alternate spelling of the same column; if some rows carry their state
# only in 'state_', they are excluded here — confirm against the raw data.
cahps_df = cahps_df.filter(pl.col('state').is_in(['CA']))


In [64]:
# Remember how many rows we start with so the clean-up can be checked afterwards
initial_row_count = cahps_df.height

# Rename a column only when the rename cannot clash with an existing column
def safe_rename(df, old_name, new_name):
    """Return ``df`` with ``old_name`` renamed to ``new_name`` when that is safe.

    The rename is applied only if ``old_name`` is present in ``df.columns``
    and ``new_name`` is not. In every other case ``df`` is returned unchanged.
    """
    existing = df.columns
    if old_name not in existing:
        return df
    if new_name in existing:
        return df
    return df.rename({old_name: new_name})

# Check missing values before renaming
print("Missing values before renaming:")
print(f"'county': {cahps_df['county'].null_count()}")
print(f"'countyparish': {cahps_df['countyparish'].null_count()}")

# (old_name, new_name) pairs: alternate spellings of the same column.
column_renames = [
    ('zip', 'zip_code'),
    ('telephone_number', 'phone_number'),
    ('citytown', 'city'),
    ('facility_name_', 'facility_name'),
    ('facility', 'facility_name'),
    ('state_', 'state'),
    ('chain_organization_', 'chain_organization'),
    ('ich_cahps_survey_of_patients_experiences_star_rating_', 'ich_cahps_survey_of_patients_experiences_star_rating'),
    ('countyparish', 'county'),
]

# Columns containing the string patientsrating are renamed to patients_rating
column_renames += [
    (name, name.replace('patientsrating', 'patients_rating'))
    for name in cahps_df.columns
    if 'patientsrating' in name
]

# Merge-or-rename every pair.
# When both spellings exist, the old column is coalesced into the new one
# before being dropped, so values present only under the old spelling are kept
# (previously this was done only for 'county' and the patientsrating columns;
# the other old columns were dropped without merging).
# NOTE(review): coalescing assumes both spellings share a dtype or a common
# supertype — confirm for columns such as 'zip' / 'zip_code'.
for old_name, new_name in column_renames:
    if old_name not in cahps_df.columns:
        continue
    if new_name in cahps_df.columns:
        cahps_df = cahps_df.with_columns(
            pl.coalesce(new_name, old_name).alias(new_name)
        ).drop(old_name)
    else:
        cahps_df = safe_rename(cahps_df, old_name, new_name)

# No blanket drop of the old names is needed: each one has been renamed or
# merged-and-dropped above. Dropping names that no longer exist raises
# ColumnNotFoundError on polars versions where drop() is strict by default.

# Check missing values after processing
print("\nMissing values after processing:")
print(f"'county': {cahps_df['county'].null_count()}")

# Check if the number of rows has changed
final_row_count = cahps_df.shape[0]
if initial_row_count == final_row_count:
    print(f"\nNo rows were dropped. Row count remains {final_row_count}.")
else:
    print(f"\nWarning: Row count changed from {initial_row_count} to {final_row_count}.")

# Some data validation using null values in county column
if cahps_df['county'].null_count() > 0:
    print("\nInvestigating remaining missing values in 'county':")
    missing_county = cahps_df.filter(pl.col('county').is_null())
    print(missing_county.select(['facility_name', 'city', 'state', 'county']))

Missing values before renaming:
'county': 1478
'countyparish': 3567

Missing values after processing:
'county': 0

No rows were dropped. Row count remains 5045.


In [None]:
# Columns of interest for the analysis.
# Stored as strings: bare identifiers here would be evaluated as (undefined)
# Python variables and raise NameError when the cell is run.
COLUMNS_OF_INTEREST = [
    'linearized_score_of_nephrologists_communication_and_caring',
    'star_rating_of_nephrologists_communication_and_caring',
    'linearized_score_of_quality_of_dialysis_center_care_and_operations',
    'star_rating_of_quality_of_dialysis_center_care_and_operations',
    'linearized_score_of_providing_information_to_patients',
    'star_rating_of_providing_information_to_patients',
    'linearized_score_of_rating_of_the_nephrologist',
    'star_rating_of_the_nephrologist',
    'linearized_score_of_rating_of_the_dialysis_center_staff',
    'star_rating_of_the_dialysis_center_staff',
    'linearized_score_of_rating_of_the_dialysis_facility',
    'star_rating_of_the_dialysis_facility',
    'total_number_of_completed_interviews_from_the_fall_and_spring_surveys',
    'ich_cahps_survey_of_patients_experiences_star_rating',
    'survey_response_rate',
    'year',
    'month',
    'ichcahps_survey_response_rate',
    'overall_ich_cahps_survey_of_patients_experiences_star_rating',
    'ich_cahps_quality_of_patient_care_star_rating',
]

