In [4]:
from datetime import datetime
from pathlib import Path

import polars as pl
import pyarrow.parquet as pq


In [19]:
# Load the four input datasets in one pass. The first two paths point at the
# clean-data folder (CMS CAHPS and CMS facility data); the last two point at the
# raw-data folder (sub-county ballot measures and the CHHS facility listing).
cahps_df, facility_df, ballot_measures_df, supp_facility_df = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        '../../003_data/002_clean-data/national_cms_dialysis-facility_cahps-data.parquet',
        '../../003_data/002_clean-data/national_cms_dialysis-facility_data.parquet',
        '../../003_data/001_raw-data/2018-2022_ballot-measure_sub-county_data.parquet',
        '../../003_data/001_raw-data/2013-2023_CHHS_dialysis-facility_data.parquet',
    )
)


In [20]:
# Left-join the CAHPS survey data onto the facility data, one row per
# (provider_number, year) of the facility frame.
merged_cms_df = facility_df.join(cahps_df, how='left', on=['provider_number', 'year'])

# Columns present in both frames come back from the join with a "_right" suffix;
# they duplicate the facility-side columns, so they are removed.
duplicate_right_columns = merged_cms_df.select(pl.col("^.*_right$")).columns
merged_cms_df = merged_cms_df.drop(duplicate_right_columns)


In [22]:
# Display the merged CMS frame to inspect the result of the left join
merged_cms_df

(5684, 173)

In [9]:
# The year columns of the supplemental facility data and the ballot measure data
# are cast to strings so that they match the year format of the merged CMS frame.
supp_facility_df = supp_facility_df.with_columns(year=pl.col("year").cast(pl.Utf8))
ballot_measures_df = ballot_measures_df.with_columns(year=pl.col("year").cast(pl.Utf8))


In [None]:
# Display the ballot measures frame for a visual check
ballot_measures_df

In [74]:
# Place names are lowercased so that they can be used as case-insensitive join
# keys between the CMS data and the ballot measure data.

# CMS frame: county and city names
merged_cms_df = merged_cms_df.with_columns(
    pl.col("county", "city").str.to_lowercase()
)

# Ballot measures frame: county and sub-county names
ballot_measures_df = ballot_measures_df.with_columns(
    pl.col("county", "sub_county").str.to_lowercase()
)

In [None]:
# Extra data validation step: compare the county names found in each dataset.
# At this point there shouldn't be any county name that appears in only one dataframe.
#
# facility_df and cahps_df are the frames as loaded from disk; the lowercasing step
# in this notebook is applied to merged_cms_df and ballot_measures_df only. The names
# are therefore lowercased here as well, so that case differences are not reported
# as mismatches. Null counties are dropped because sorted() cannot order None
# against strings.

# Getting unique (lowercased, non-null) counties from all dataframes
ballot_counties = set(ballot_measures_df['county'].drop_nulls().str.to_lowercase().unique())
facility_counties = set(facility_df['county'].drop_nulls().str.to_lowercase().unique())
cahps_counties = set(cahps_df['county'].drop_nulls().str.to_lowercase().unique())

print("Unique counties in ballot_measures_df:")
print(sorted(ballot_counties))

print("\nUnique counties in facility_df:")
print(sorted(facility_counties))

print("\nUnique counties in cahps_df:")
print(sorted(cahps_counties))

# Finding counties that are in one dataframe but in neither of the other two
counties_only_in_ballot = ballot_counties - facility_counties - cahps_counties
counties_only_in_facility = facility_counties - ballot_counties - cahps_counties
counties_only_in_cahps = cahps_counties - ballot_counties - facility_counties

print("\nCounties only in ballot_measures_df:")
print(sorted(counties_only_in_ballot))

print("\nCounties only in facility_df:")
print(sorted(counties_only_in_facility))

print("\nCounties only in cahps_df:")
print(sorted(counties_only_in_cahps))

# Checking for potential mismatches due to formatting: a county name that is a
# proper substring of another county name (e.g. a name with and without a suffix)
all_counties = ballot_counties | facility_counties | cahps_counties
potential_mismatches = [
    county
    for county in all_counties
    if any(
        county != other_county and county in other_county
        for other_county in all_counties
    )
]

print("\nPotential county name mismatches:")
print(sorted(potential_mismatches))

In [76]:
# Keep only the supplemental facility rows whose licence category is
# "Chronic Dialysis Clinic", i.e. include only dialysis clinics
supp_facility_df = supp_facility_df.filter(
    pl.col("LIC_CAT").eq("Chronic Dialysis Clinic")
)

In [77]:
# Keep the facility identifiers and geographic features needed for the merge with
# the CMS data, renaming the CHHS columns to the names used in the CMS frame:
#   FAC_NO -> provider_number, FAC_NAME -> facility_name,
#   COUNTY -> county,          FAC_ZIP  -> zip_code
# NOTE(review): an earlier comment said extraneous numbers are stripped from the
# facility ids here, but FAC_NO is copied unchanged - confirm whether that step
# is still needed.
supp_facility_df = supp_facility_df.select(
    "year",
    pl.col("FAC_NO").alias("provider_number"),
    pl.col("FAC_NAME").alias("facility_name"),
    pl.col("COUNTY").alias("county"),
    pl.col("FAC_ZIP").alias("zip_code"),
    "ASSEMBLY_DIST",
    "SENATE_DIST",
    "CONGRESS_DIST",
    "CENS_TRACT",
    "LONGITUDE",
    "LATITUDE",
)

In [78]:
# Merging datasets using standardized facility names

def standardize_facility(df):
    """Add a ``std_facility_name`` column with a normalised facility name.

    The new column is ``facility_name`` lowercased, stripped of surrounding
    whitespace, and with every character other than ``a-z`` and ``0-9`` removed.

    Parameters
    ----------
    df : polars DataFrame with a string ``facility_name`` column.

    Returns
    -------
    The DataFrame returned by ``with_columns``, i.e. ``df`` plus ``std_facility_name``.
    """
    normalised_name = (
        pl.col('facility_name')
        .str.to_lowercase()
        .str.strip_chars()
        .str.replace_all(r'[^a-z0-9]', '')  # drop everything that is not a lowercase letter or digit
    )
    return df.with_columns(normalised_name.alias('std_facility_name'))

# Add the standardized facility name to both frames that take part in the merge
supp_facility_df = supp_facility_df.pipe(standardize_facility)
merged_cms_df = merged_cms_df.pipe(standardize_facility)

# Chain organization names are standardized so that they are consistent across time.
# In particular, some chain organization names include "inc" and some do not.

def standardize_chain_organization(df):
    """Add a ``std_chain_organization`` column with a normalised chain name.

    The new column is ``chain_organization`` lowercased, with the whole word
    ``inc`` removed, then every character other than ``a-z`` and ``0-9`` removed,
    and finally stripped of surrounding whitespace.

    Parameters
    ----------
    df : polars DataFrame with a string ``chain_organization`` column.

    Returns
    -------
    The DataFrame returned by ``with_columns``, i.e. ``df`` plus ``std_chain_organization``.
    """
    normalised_chain = (
        pl.col('chain_organization')
        .str.to_lowercase()
        .str.replace_all(r'\binc\b', '')  # 'inc' only when it stands as a whole word
        .str.replace_all(r'[^a-z0-9]', '')  # drop everything that is not a lowercase letter or digit
        .str.strip_chars()
    )
    return df.with_columns(normalised_chain.alias('std_chain_organization'))

# Add the standardized chain organization name to the CMS frame
merged_cms_df = merged_cms_df.pipe(standardize_chain_organization)

# Here we're using multiple columns for matching.
# This is important because some facilities are essentially chains, so they have
# the same name regardless of zip code.
# We also tried using a fuzzy merge to match the facility names, but decided that
# having an accurate merge was more important than using a more complicated, but
# less exact, matching algorithm.
#
# The supplemental frame carries a `year` column, so `year` is part of the join key:
# matching on name and zip code alone would pair every CMS facility-year row with
# every supplemental row for that facility across years and multiply the CMS rows.
# Both `year` columns are cast to strings earlier in the notebook.
# NOTE(review): a facility-year that is missing from the supplemental data now gets
# null geography instead of borrowing another year's values - confirm this is intended.
supp_join_keys = ['std_facility_name', 'zip_code', 'year']

supplemented_cms_df = merged_cms_df.join(
    # One supplemental row per key, so the left join can only enrich rows, never duplicate them
    supp_facility_df.unique(subset=supp_join_keys, keep='first', maintain_order=True),
    on=supp_join_keys,
    how='left'
)

# Guard: enriching with supplemental data must leave the number of CMS rows unchanged
assert supplemented_cms_df.height == merged_cms_df.height, (
    "Join with supplemental facility data changed the number of CMS rows"
)


In [None]:
# Number of rows per Assembly district, largest first (rows without a district are ignored).
#
# Author's observations from the output:
# - The three districts with the fewest dialysis facilities are 63, 23, and 67.
#   63 is in a mountainous part of Sacramento, 67 is in a mountainous part of
#   Santa Clara, and 23 is a land area occupied mostly by Disneyland.
# - The three districts with the most dialysis facilities are 66, 57, and 48.
#   All are in the LA area.
(
    supplemented_cms_df
    .drop_nulls("ASSEMBLY_DIST")
    .group_by("ASSEMBLY_DIST")
    .len()
    .sort("len", descending=True)
)


In [80]:
# City-level ballot results, reshaped for merging with the CMS cahps and facility data.
# City-level votes can be merged directly with the original cahps and facility data.
#
# After filtering to geo_type == "city" the pivot produces a single value column
# named "city" that holds the vote counts. Both expressions in with_columns read
# the pivoted frame: the counts are copied (as Int64) into "vote_count", and the
# "city" column is then overwritten with the sub-county (city) name.
ballot_measures_df_city = (
    ballot_measures_df
    .filter(pl.col("geo_type") == "city")
    .pivot(
        on='geo_type',
        index=['county', 'year', 'sub_county', 'vote_type'],
        values='vote_count',
    )
    .with_columns(
        vote_count=pl.col("city").cast(pl.Int64),
        city=pl.col("sub_county"),
    )
    .drop("sub_county")
)

# Assembly-district-level ballot results, reshaped for merging with the CMS data.
# These votes can only be merged after the cahps and facility data have been
# joined with the supplemental California Health and Human Services data, which
# provides the ASSEMBLY_DIST column.
#
# After filtering to geo_type == "state_assembly_district" the pivot produces a
# single value column of that name holding the vote counts. The counts are copied
# (as Int64) into "vote_count" and district_id is cast to Float64 as "ASSEMBLY_DIST".
ballot_measures_df_assembly_district = (
    ballot_measures_df
    .filter(pl.col("geo_type") == "state_assembly_district")
    .pivot(
        on='geo_type',
        index=['year', 'county', 'district_id', 'vote_type'],
        values='vote_count',
    )
    .with_columns(
        vote_count=pl.col("state_assembly_district").cast(pl.Int64),
        ASSEMBLY_DIST=pl.col("district_id").cast(pl.Float64),
    )
    .drop("state_assembly_district")
)

In [81]:
# Columns of the merged CMS frame that are kept for the city-level analysis
cms_variables_of_interest_by_city = [
    "year",
    "county",
    "city",
    "provider_number",
    "profit_or_nonprofit",
    "std_chain_organization",
    "ichcahps_data_availability_code",
    "five_star_data_availability_code",
    "linearized_score_of_rating_of_the_dialysis_facility",
    "five_star",
    "ich_cahps_survey_of_patients_experiences_star_rating",
    "linearized_score_of_rating_of_the_dialysis_center_staff",
    "patient_transplant_waitlist_data_availability_code",
    "standardized_first_kidney_transplant_waitlist_ratio",  # higher is better
    "_of_dialysis_stations",
    "patient_hospital_readmission_category",
    "mortality_rate_facility",
]

# Attach the city-level votes: left join on year, county and city, so CMS rows
# without a matching ballot result are kept
merged_cms_df_with_votes_by_city = merged_cms_df.select(
    cms_variables_of_interest_by_city
).join(
    ballot_measures_df_city,
    how='left',
    on=['year', 'county', 'city'],
)

In [82]:
# Columns of the supplemented CMS frame that are kept for the
# Assembly-district-level analysis
cms_variables_of_interest_by_assembly_district = [
    "year",
    "county",
    "city",
    "provider_number",
    "ASSEMBLY_DIST",
    "CENS_TRACT",
    "LONGITUDE",
    "LATITUDE",
    "profit_or_nonprofit",
    "std_chain_organization",
    "ichcahps_data_availability_code",
    "five_star_data_availability_code",
    "linearized_score_of_rating_of_the_dialysis_facility",
    "five_star",
    "ich_cahps_survey_of_patients_experiences_star_rating",
    "linearized_score_of_rating_of_the_dialysis_center_staff",
    "patient_transplant_waitlist_data_availability_code",
    "standardized_first_kidney_transplant_waitlist_ratio",  # higher is better
    "_of_dialysis_stations",
    "patient_hospital_readmission_category",
    "mortality_rate_facility",
]

# Attach the Assembly-district-level votes. Rows without an Assembly district are
# removed first; the remaining rows are left-joined on year, county and district.
supplemented_cms_df_with_votes_by_assembly_district = (
    supplemented_cms_df
    .select(cms_variables_of_interest_by_assembly_district)
    .drop_nulls("ASSEMBLY_DIST")
    .join(
        ballot_measures_df_assembly_district,
        how='left',
        on=['year', 'county', 'ASSEMBLY_DIST'],
    )
)

In [None]:
# Function to save Polars DataFrame as Parquet with metadata
def save_polars_parquet_with_metadata(df, output_path, description, version='1.0', compression='snappy'):
    """Write a Polars DataFrame to a Parquet file with descriptive schema metadata.

    The frame is converted to an Arrow table, the table's schema metadata is
    extended with ``created_at``, ``description``, ``version`` and
    ``cleaning_steps`` entries, and the result is written with pyarrow.
    The directory that will contain the file is created if it does not exist.

    Parameters
    ----------
    df : polars DataFrame to save.
    output_path : str or path-like destination of the Parquet file.
    description : str stored under the ``description`` metadata key.
    version : str stored under the ``version`` metadata key (default ``'1.0'``).
    compression : compression codec passed to ``pq.write_table`` (default ``'snappy'``).

    Raises
    ------
    Exception
        Any error raised while converting or writing is printed and re-raised.
    """
    try:
        # Convert Polars DataFrame to Arrow Table
        arrow_table = df.to_arrow()

        # Start from a copy of the existing schema metadata (empty if there is none)
        metadata = dict(arrow_table.schema.metadata or {})

        # Update metadata
        metadata.update({
            b'created_at': str(datetime.now()).encode('utf-8'),
            b'description': description.encode('utf-8'),
            b'version': version.encode('utf-8'),
            b'cleaning_steps': b'''
                1. Imported and read necessary datasets (CMS CAHPS, CMS facility, ballot measures, and supplemental facility data).
                2. Merged CMS CAHPS and facility data.
                3. Standardized data types (e.g., casting year columns to string).
                4. Lowercased county and city names across all datasets for consistency.
                5. Performed data validation by checking for unique counties across datasets.
                6. Filtered supplemental facility data to include only chronic dialysis clinics.
                7. Standardized facility and chain organization names across datasets for accurate merging.
                8. Merged CMS data with supplemental facility data using standardized facility names and zip codes.
                9. Reshaped ballot measures data for city-level and assembly-district-level analysis.
                10. Created separate datasets for city-level and assembly-district-level analysis by merging with relevant ballot measure data.
                11. Selected variables of interest for each analysis type (e.g., city-level and assembly-district-level).
            '''
        })

        # Creating a new Arrow Table with updated metadata
        updated_table = arrow_table.replace_schema_metadata(metadata)

        # Make sure the destination directory exists so the write does not fail on a fresh checkout
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Writing the updated table to a Parquet file (Snappy compression by default to reduce file size)
        pq.write_table(updated_table, output_path, compression=compression)

        print(f"Data saved to {output_path}")
    except Exception as e:
        # Report the failure in the notebook output, then let the error propagate
        print(f"Error saving data: {e}")
        raise

# Destination files for the two merged datasets
city_output_path = '../../003_data/003_merged-data/merged_cms_ballot-measures_by-city.parquet'
assembly_output_path = '../../003_data/003_merged-data/merged_cms_ballot-measures_by-assembly-district.parquet'

# Write the city-level dataset
save_polars_parquet_with_metadata(
    df=merged_cms_df_with_votes_by_city,
    output_path=city_output_path,
    description="Merged CMS and ballot measure data by city",
)

# Write the Assembly-district-level dataset
save_polars_parquet_with_metadata(
    df=supplemented_cms_df_with_votes_by_assembly_district,
    output_path=assembly_output_path,
    description="Merged CMS and ballot measure data by assembly district",
)