In [9]:
import polars as pl
from datetime import datetime
import pyarrow.parquet as pq

In [17]:

# Load the data
facility_df = pl.read_parquet('../../003_data/001_raw-data/2017-2024_national_cms_dialysis-facility_data.parquet')
cahps_df = pl.read_parquet('../../003_data/001_raw-data/2017-2024_national_cms_dialysis-facility_cahps-data.parquet')

# Display the first few rows of the cahps data
cahps_df.head()

# Filter the data for California
cahps_df = cahps_df.filter(pl.col('state').is_in(['CA']))
facility_df = facility_df.filter(pl.col('state').is_in(['CA']))



In [18]:
# Storing the initial number of rows to compare after cleaning
initial_row_count = cahps_df.shape[0]

# Check missing values before renaming columns
print("Missing values before renaming:")
print(f"'county': {cahps_df['county'].null_count()}")
print(f"'countyparish': {cahps_df['countyparish'].null_count()}")

# Merge duplicate columns
cahps_df = cahps_df.with_columns(pl.coalesce('city', 'citytown').alias('city'))
cahps_df = cahps_df.with_columns(pl.coalesce('county', 'countyparish').alias('county'))
cahps_df = cahps_df.with_columns(pl.coalesce('zip_code', 'zip').alias('zip_code'))
cahps_df = cahps_df.with_columns(pl.coalesce('phone_number', 'telephone_number').alias('phone_number'))
cahps_df = cahps_df.with_columns(pl.coalesce('facility', 'facility_name').alias('facility_name'))
cahps_df = cahps_df.with_columns(pl.coalesce('facility_name', 'facility_name_').alias('facility_name'))
cahps_df = cahps_df.with_columns(pl.coalesce('state', 'state_').alias('state'))
cahps_df = cahps_df.with_columns(pl.coalesce('chain_organization', 'chain_organization_').alias('chain_organization'))
cahps_df = cahps_df.with_columns(pl.coalesce('ich_cahps_survey_of_patients_experiences_star_rating', 'ich_cahps_survey_of_patients_experiences_star_rating_').alias('ich_cahps_survey_of_patients_experiences_star_rating'))
cahps_df = cahps_df.with_columns(pl.coalesce('overall_ich_cahps_survey_of_patients_experiences_star_rating', 'ich_cahps_survey_of_patients_experiences_star_rating').alias('ich_cahps_survey_of_patients_experiences_star_rating'))

# Changing the name of columns with the string patientsrating to patients_rating
for col in cahps_df.columns:
    if 'patientsrating' in col:
        new_col = col.replace('patientsrating', 'patients_rating')
        if new_col in cahps_df.columns:
            # Merge columns if the new name already exists
            cahps_df = cahps_df.with_columns(pl.coalesce(new_col, col).alias(new_col))
            cahps_df = cahps_df.drop(col)
        else:
            cahps_df = cahps_df.rename({col: new_col})

# Drop the original, changed column names
cahps_df = cahps_df.drop('citytown', 'zip', 'telephone_number', 'countyparish', 'facility_name_', 'facility', 'state_', 'chain_organization_', 'ich_cahps_survey_of_patients_experiences_star_rating_', 'citytown')

# Check missing values after processing
print("\nMissing values after processing:")
print(f"'county': {cahps_df['county'].null_count()}")

# Check if the number of rows has changed
final_row_count = cahps_df.shape[0]
if initial_row_count == final_row_count:
    print(f"\nNo rows were dropped. Row count remains {final_row_count}.")
else:
    print(f"\nWarning: Row count changed from {initial_row_count} to {final_row_count}.")

# Some data validation using null values in county column
if cahps_df['county'].null_count() > 0:
    print("\nInvestigating remaining missing values in 'county':")
    missing_county = cahps_df.filter(pl.col('county').is_null())
    print(missing_county.select(['facility_name', 'city', 'state', 'county']))

Missing values before renaming:
'county': 1478
'countyparish': 3567

Missing values after processing:
'county': 0

No rows were dropped. Row count remains 5045.


In [19]:
cahps_df = cahps_df.with_columns(
    pl.when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase() == "1")
    .then(pl.lit("profit"))
    .when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase() == "2")
    .then(pl.lit("non_profit"))
    .when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase().str.contains("non-profit"))
    .then(pl.lit("non_profit"))
    .when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase().str.contains("nonprofit"))
    .then(pl.lit("non_profit"))
    .when(pl.col("profit_or_nonprofit") == "")
    .then(pl.lit(None))
    .otherwise(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase().str.replace(" ", "_"))
    .alias("profit_or_nonprofit")
)

# Cleaning values in chain_owned column

cahps_df = cahps_df.with_columns(
    pl.when(pl.col("chain_owned") == "Y").then(pl.lit("yes"))
    .when(pl.col("chain_owned") == "N").then(pl.lit("no"))
    .when(pl.col("chain_owned") == "")
    .then(pl.lit(None))
    .otherwise(pl.col("chain_owned").str.to_lowercase().str.replace(" ", "_"))
    .alias("chain_owned")
)

# Changing Y and N, Yes and No to yes and no throughout cahps_df

cahps_df = cahps_df.with_columns(pl.col(pl.String).str.replace(r"^Y$", "yes"))

cahps_df = cahps_df.with_columns(pl.col(pl.String).str.replace(r"^N$", "no"))

cahps_df = cahps_df.with_columns(pl.col(pl.String).str.replace(r"Yes", "yes"))

cahps_df = cahps_df.with_columns(pl.col(pl.String).str.replace(r"No", "no"))

# Casting provider_number to int and then to string removes leading 0s

cahps_df = (cahps_df.with_columns(
    pl.col("provider_number").cast(pl.Int32)).
with_columns(
    pl.col("provider_number").cast(pl.Utf8))
)

# Fixing erroneous chain_organization values for a haldful of Davita and Satellite Healthcare clinics

cahps_df = cahps_df.with_columns(
    pl.when(pl.col("chain_organization").str.to_lowercase().str.contains("dallas"))
    .then(pl.lit("davita"))
    .when(pl.col("chain_organization").str.to_lowercase().str.contains("satellite"))
    .then(pl.lit("satellitehealthcare"))
    .otherwise(pl.col("chain_organization"))
    .alias("chain_organization")
)

In [20]:
# Select columns of interest
cahps_df = cahps_df.select([
    'year',
    'provider_number',
    'network',
    'facility_name',
    'address_line_1',
    'address_line_2',
    'city',
    'state',
    'zip_code',
    'county',
    'profit_or_nonprofit',
    'phone_number',
    'chain_owned',
    'chain_organization',
    'ichcahps_date',
    'ichcahps_data_availability_code',
    'linearized_score_of_nephrologists_communication_and_caring', 
    'star_rating_of_nephrologists_communication_and_caring',
    'linearized_score_of_quality_of_dialysis_center_care_and_operations',
    'star_rating_of_quality_of_dialysis_center_care_and_operations',
    'linearized_score_of_providing_information_to_patients',
    'star_rating_of_providing_information_to_patients',
    'linearized_score_of_rating_of_the_nephrologist',
    'star_rating_of_the_nephrologist',
    'linearized_score_of_rating_of_the_dialysis_center_staff', 
    'star_rating_of_the_dialysis_center_staff',
    'linearized_score_of_rating_of_the_dialysis_facility',
    'star_rating_of_the_dialysis_facility',
    'total_number_of_completed_interviews_from_the_fall_and_spring_surveys',
    'ich_cahps_survey_of_patients_experiences_star_rating',
    'survey_response_rate',
    'ichcahps_survey_response_rate',
    'ich_cahps_quality_of_patient_care_star_rating'
])



In [21]:
# Coalescing columns with mismatched names

facility_df = facility_df.with_columns(pl.coalesce('city', 'citytown').alias('city'))
facility_df = facility_df.with_columns(pl.coalesce('county', 'countyparish').alias('county'))
facility_df = facility_df.with_columns(pl.coalesce('zip_code', 'zip').alias('zip_code'))
facility_df = facility_df.with_columns(pl.coalesce('phone_number', 'telephone_number').alias('phone_number'))
facility_df = facility_df.with_columns(pl.coalesce('hospitalization_rate_facility_', 'hospitalization_rate_facility').alias('hospitalization_rate_facility'))
facility_df = facility_df.with_columns(pl.coalesce('crownweb__date_', 'crownweb_date').alias('crownweb_date'))
facility_df = facility_df.with_columns(pl.coalesce('number_of_patientmonths_in_serum_phosphorus_summary_', 'number_of_patientmonths_in_serum_phosphorus_summary').alias('number_of_patientmonths_in_serum_phosphorus_summary'))
facility_df = facility_df.with_columns(pl.coalesce('serum_phosphorus_data_availability_code_', 'serum_phosphorus_data_availability_code').alias('serum_phosphorus_data_availability_code'))
facility_df = facility_df.with_columns(pl.coalesce('standard_infection_ratio_', 'standard_infection_ratio').alias('standard_infection_ratio'))
facility_df = facility_df.with_columns(pl.coalesce('hospitalization_rate_facility_', 'hospitalization_rate_facility').alias('hospitalization_rate_facility'))
facility_df = facility_df.with_columns(pl.coalesce('readmission_rate_facility', 'readmission_rate_facility_').alias('readmission_rate_facility'))
facility_df = facility_df.with_columns(pl.coalesce('patient_hospital_readmission_category', 'patient_hospital_readmission_category_text').alias('patient_hospital_readmission_category'))
# Dropping columns with mismatched names

facility_df = facility_df.drop('patient_hospital_readmission_category_text','readmission_rate_facility_','hospitalization_rate_facility_','standard_infection_ratio_', 'number_of_patientmonths_in_serum_phosphorus_summary_', 'crownweb__date_', 'serum_phosphorus_data_availability_code', 'citytown', 'zip', 'telephone_number', 'countyparish')

In [22]:
# Cleaning values in profit_or_nonprofit column

facility_df = facility_df.with_columns(
    pl.when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase() == "1")
    .then(pl.lit("profit"))
    .when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase() == "2")
    .then(pl.lit("non_profit"))
    .when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase().str.contains("non-profit"))
    .then(pl.lit("non_profit"))
    .when(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase().str.contains("nonprofit"))
    .then(pl.lit("non_profit"))
    .when(pl.col("profit_or_nonprofit") == "")
    .then(pl.lit(None))
    .otherwise(pl.col("profit_or_nonprofit").cast(pl.Utf8).str.to_lowercase().str.replace(" ", "_"))
    .alias("profit_or_nonprofit")
)

# Cleaning values in chain_owned column

facility_df = facility_df.with_columns(
    pl.when(pl.col("chain_owned") == "Y").then(pl.lit("yes"))
    .when(pl.col("chain_owned") == "N").then(pl.lit("no"))
    .when(pl.col("chain_owned") == "")
    .then(pl.lit(None))
    .otherwise(pl.col("chain_owned").str.to_lowercase().str.replace(" ", "_"))
    .alias("chain_owned")
)


# Changing Y and N, Yes and No to yes and no throughout facility_df

facility_df = facility_df.with_columns(pl.col(pl.String).str.replace(r"^Y$", "yes"))

facility_df = facility_df.with_columns(pl.col(pl.String).str.replace(r"^N$", "no"))

facility_df = facility_df.with_columns(pl.col(pl.String).str.replace(r"Yes", "yes"))

facility_df = facility_df.with_columns(pl.col(pl.String).str.replace(r"No", "no"))

# Casting provider_number to int and then to string removes leading 0s

facility_df = (facility_df.with_columns(
    pl.col("provider_number").cast(pl.Int32)).
with_columns(
    pl.col("provider_number").cast(pl.Utf8))
)

# Fixing erroneous chain_organization values for a haldful of Davita and Satellite Healthcare clinics
facility_df = facility_df.with_columns(
    pl.when(pl.col("chain_organization").str.to_lowercase().str.contains("dallas"))
    .then(pl.lit("davita"))
    .when(pl.col("chain_organization").str.to_lowercase().str.contains("satellite"))
    .then(pl.lit("satellitehealthcare"))
    .otherwise(pl.col("chain_organization"))
    .alias("chain_organization")
)

In [23]:
# Generating a timestamp for the files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Defining output paths without timestamp to replace the prior version
facility_output_path = '../../003_data/002_clean-data/national_cms_dialysis-facility_data.parquet'
cahps_output_path = '../../003_data/002_clean-data/national_cms_dialysis-facility_cahps-data.parquet'

# Function to save Polars DataFrame as Parquet with metadata
def save_polars_parquet_with_metadata(df, output_path, description):
    try:
        # Convert Polars DataFrame to Arrow Table
        arrow_table = df.to_arrow()

        # Get existing metadata
        metadata = arrow_table.schema.metadata if arrow_table.schema.metadata else {}

        # Update metadata
        metadata.update({
            b'created_at': str(datetime.now()).encode('utf-8'),
            b'description': description.encode('utf-8'),
            b'version': b'1.0',
            b'cleaning_steps': b'''
                1. Filtered data for California.
                2. Merged duplicate columns (e.g., city/citytown, county/countyparish).
                3. Renamed columns for consistency (e.g., patientsrating to patients_rating).
                4. Dropped redundant columns.
                5. Standardized values in profit_or_nonprofit and chain_owned columns.
                6. Converted Y/N and Yes/No to lowercase yes/no throughout the dataset.
                7. Fixed erroneous chain_organization values for a handful of Davita and Satellite Healthcare clinics.
                8. Casting provider_number to int and then to string removes leading 0s.
                9. Selected relevant columns for the final dataset.
                10. Coalesced columns with mismatched names.
                11. Cleaned and standardized categorical variables.
            '''
        })

        # Creating a new Arrow Table with updated metadata
        updated_table = arrow_table.replace_schema_metadata(metadata)

        # Writing the updated table to a Parquet file with Snappy compression to reduce file size
        pq.write_table(updated_table, output_path, compression='snappy')

        print(f"Data saved to {output_path}")
    except Exception as e:
        print(f"Error saving data: {e}")
        raise

# Saving the merged dataframes as parquet files
save_polars_parquet_with_metadata(facility_df, facility_output_path, "Cleaned CMS dialysis facility data")
save_polars_parquet_with_metadata(cahps_df, cahps_output_path, "Cleaned CMS dialysis facility CAHPS data")

Data saved to ../../003_data/002_clean-data/national_cms_dialysis-facility_data.parquet
Data saved to ../../003_data/002_clean-data/national_cms_dialysis-facility_cahps-data.parquet
