In [268]:
import polars as pl

In [269]:
cahps_df = pl.read_parquet('../../003_data/002_clean-data/national_cms_dialysis-facility_cahps-data.parquet')
facility_df = pl.read_parquet('../../003_data/002_clean-data/national_cms_dialysis-facility_data.parquet')
ballot_measures_df = pl.read_parquet('../../003_data/001_raw-data/2018-2022_ballot-measure_sub-county_data.parquet')
supp_facility_df = pl.read_parquet('../../003_data/001_raw-data/2013-2023_CHHS_dialysis-facility_data.parquet')


In [270]:
# Casting year columns to string to match the format in the other dataframes

supp_facility_df = supp_facility_df.with_columns(pl.col("year").cast(pl.Utf8))

ballot_measures_df = ballot_measures_df.with_columns(pl.col("year").cast(pl.Utf8))


In [273]:
ballot_measures_df

year,county,sub_county,district_id,vote_type,vote_count
str,str,str,str,str,str
"""2022""","""Alameda""","""county_supervisorial_district""","""1""","""yes""","""31161"""
"""2022""","""Alameda""","""county_supervisorial_district""","""2""","""yes""","""28778"""
"""2022""","""Alameda""","""county_supervisorial_district""","""3""","""yes""","""34338"""
"""2022""","""Alameda""","""county_supervisorial_district""","""4""","""yes""","""33897"""
"""2022""","""Alameda""","""county_supervisorial_district""","""5""","""yes""","""54523"""
…,…,…,…,…,…
"""2018""","""Yuba""","""state_assembly_district""","""3""","""no""","""12845"""
"""2018""","""Yuba""","""state_assembly_district""","""1""","""no""","""12845"""
"""2018""","""Yuba""","""city""",,"""no""","""1857"""
"""2018""","""Yuba""","""city""",,"""no""","""591"""


In [274]:
# The counties unique to the ballot measures dataframe are all in rural areas
# This tells us dialysis facilites aren't located in more rural parts of california

# Lowercasing all county and city names in the CMS dataframes

facility_df = (
    facility_df
    .with_columns(
        pl.col("county")
        .str.to_lowercase()
        .alias("county"),
        pl.col("city")
        .str.to_lowercase()
        .alias("city")
    )
)

cahps_df = (
    cahps_df
    .with_columns(
        pl.col("county")
        .str.to_lowercase()
        .alias("county"),
        pl.col("city")
        .str.to_lowercase()
        .alias("city")
    )
)

# Lowercasing all county names in the SOS ballot measures dataframe

ballot_measures_df = (
    ballot_measures_df
    .with_columns(
        pl.col("county")
        .str.to_lowercase()
        .alias("county")
    )
)

# Getting unique counties from all dataframes
ballot_counties = set(ballot_measures_df['county'].unique())
facility_counties = set(facility_df['county'].unique())
cahps_counties = set(cahps_df['county'].unique())

print("Unique counties in ballot_measures_df:")
print(sorted(ballot_counties))

print("\nUnique counties in facility_df:")
print(sorted(facility_counties))

print("\nUnique counties in cahps_df:")
print(sorted(cahps_counties))

# Finding counties that are in one dataframe but not others
counties_only_in_ballot = ballot_counties - facility_counties - cahps_counties
counties_only_in_facility = facility_counties - ballot_counties - cahps_counties
counties_only_in_cahps = cahps_counties - ballot_counties - facility_counties

print("\nCounties only in ballot_measures_df:")
print(sorted(counties_only_in_ballot))

print("\nCounties only in facility_df:")
print(sorted(counties_only_in_facility))

print("\nCounties only in cahps_df:")
print(sorted(counties_only_in_cahps))

# Checking for potential mismatches due to formatting
all_counties = ballot_counties.union(facility_counties).union(cahps_counties)
potential_mismatches = [county for county in all_counties if any(
    county != other_county and county in other_county
    for other_county in all_counties
)]

print("\nPotential county name mismatches:")
print(sorted(potential_mismatches))

Unique counties in ballot_measures_df:
['alameda', 'alpine', 'amador', 'butte', 'calaveras', 'colusa', 'contra costa', 'del norte', 'el dorado', 'fresno', 'glenn', 'humboldt', 'imperial', 'inyo', 'kern', 'kings', 'lake', 'lassen', 'los angeles', 'madera', 'marin', 'mariposa', 'mendocino', 'merced', 'modoc', 'mono', 'monterey', 'napa', 'nevada', 'orange', 'placer', 'plumas', 'riverside', 'sacramento', 'san benito', 'san bernardino', 'san diego', 'san francisco', 'san joaquin', 'san luis obispo', 'san mateo', 'santa barbara', 'santa clara', 'santa cruz', 'shasta', 'sierra', 'siskiyou', 'solano', 'sonoma', 'stanislaus', 'sutter', 'tehama', 'trinity', 'tulare', 'tuolumne', 'ventura', 'yolo', 'yuba']

Unique counties in facility_df:
['', 'alameda', 'butte', 'columbia', 'colusa', 'contra costa', 'del norte', 'el dorado', 'fresno', 'humboldt', 'imperial', 'inyo', 'kern', 'kings', 'lake', 'los angeles', 'madera', 'marin', 'mendocino', 'merced', 'monterey', 'napa', 'nevada', 'orange', 'placer',

In [275]:
# Filtering the supplemental facility information dataframe to include on dialysis clinics

supp_facility_df = supp_facility_df.filter(pl.col("LIC_CAT") == "Chronic Dialysis Clinic")

In [276]:
# Selecting facility ids and geographic features to merge with CMS data
# Stripping extraneous numbers from facility ids to match the id format in CMS data
# Renaming FAC_NO provider_number to match the id format in CMS data

supp_facility_df = (supp_facility_df
 .with_columns(
     pl.col("FAC_NO")
     .alias("provider_number"))
 .with_columns(
     pl.col("COUNTY")
     .alias("county"))
 .with_columns(
     pl.col("FAC_NAME")
     .alias("facility_name"))
 .with_columns(
     pl.col("FAC_ZIP")
     .alias("zip_code"))
 .select([
     "year",
    "provider_number",
    "facility_name",
    "county",
    "zip_code",
    "ASSEMBLY_DIST",
    "SENATE_DIST",
    "CONGRESS_DIST",
    "CENS_TRACT",
    "LONGITUDE",
    "LATITUDE"
    ])
)

In [277]:
merged_df = cahps_df.join(
    supp_facility_df,
    on="facility_name",
    how="left"
)

print(merged_df.shape)
print(cahps_df.shape)

(7898, 43)
(5045, 33)


In [278]:
# Merging datasets using standardized facility names

def standardize_facility(df):
    
    df = df.with_columns(
        pl.col('facility_name')
        .str.to_lowercase()
        .str.replace_all(' ', '')
        .str.replace_all('-', '')
        .str.replace_all(',', '')
        .str.replace_all('.', '')
        .alias('std_facility_name')
    )

    return df

supp_facility_df = standardize_facility(supp_facility_df)
cahps_df = standardize_facility(cahps_df)
facility_df = standardize_facility(facility_df)

# Here we're using multiple columns for matching
# This is important because some facilities are essentially chains, so they have the same name regardless of zip code
# I also tried using a fuzzy merge to match the facility names, but decided that having an accurate merge was more important than using a more complicated, but  matching algorithm

merged_cahps_df = cahps_df.join(
    supp_facility_df,
    on=['std_facility_name', 'zip_code'],
    how='left'
)

merged_facility_df = facility_df.join(
    supp_facility_df,
    on=['std_facility_name', 'zip_code'],
    how='left'
)

In [279]:
# Comparing approaches
print(f"Multi-column merge shape: {merged_cahps_df.shape}")

print(f"Multi-column merge shape: {merged_facility_df.shape}")

Multi-column merge shape: (49139, 44)
Multi-column merge shape: (55384, 166)


In [280]:
facility_df

provider_number,network,facility_name,five_star_date,five_star,five_star_data_availability_code,address_line_1,address_line_2,state,zip_code,profit_or_nonprofit,chain_owned,chain_organization,late_shift,_of_dialysis_stations,offers_incenter_hemodialysis,offers_peritoneal_dialysis,offers_home_hemodialysis_training,certification_date,claims_date,eqrs_date,smr_date,patient_survival_category_text,patient_survival_data_availability_code,number_of_patients_included_in_survival_summary,mortality_rate_facility,mortality_rate_upper_confidence_limit_975,mortality_rate_lower_confidence_limit_25,shr_date,patient_hospitalization_category_text,patient_hospitalization_data_availability_code,number_of_patients_included_in_hospitalization_summary,hospitalization_rate_upper_confidence_limit_975,hospitalization_rate_lower_confidence_limit_25,srr_date,patient_hospital_readmission_category,patient_hospital_readmission_data_availability_code,…,percentage_of_adult_patients_with_serum_phosphorus_greater_than_70_mgdl,long_term_catheter_data_availability_code,number_of_patients_in_long_term_catheter_summary,number_of_patient_months_in_long_term_catheter_summary,percentage_of_adult_patients_with_long_term_catheter_in_use,npcr_data_availability_code,number_of_patients_in_npcr_summary,number_of_patientmonths_in_npcr_summary,percentage_of_pediatric_hd_patients_with_npcr,year,month,city,county,phone_number,certification_or_recertification_date,crownweb_date,serum_phosphorus_data_availability_code_,dateswr,offers_incenter_peritoneal_dialysis,rate_of_hospital_readmission_category_text,percentage_of_medicare_patients_with_hgb_10_gdl,number_of_patients_included_in_transfusion_summary,standard_infection_ratio_,percentage_of_adult_hd_patients_with_ktv_12,percentage_of_adult_pd_patients_with_ktv17,percentage_of_pediatric_hd_patents_with_ktv12,percentage_of_pediatric_pd_patents_with_ktv18,number_of_adult_patients_included_in_arterial_venous_fistula_and_catheter_summaries,number_of_adult_patientmonths_included_in_arterial_venous_fistula_and_catheter_summaries,arteriovenous_fistulae_in_use_data_availability_code,percentage_of_patients_with_arteriovenous_fistulae_in_use,vascular_catheter_data_availability_code,percentage_of_patients_with_vascular_catheter_in_use_for_90_days_or_longer,hospitalization_rate_facility_,patient_hospital_readmission_category_text,readmission_rate_facility_,std_facility_name
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""52305""","""17""","""SANTA CLARA VALLEY RENAL CARE …","""01Jan2019-31Dec2022""","""3""","""1""","""2220 MOORPARK AVENUE""",,"""CA""","""95128""","""non-profit""","""no""","""Independent""","""yes""","""25""","""yes""","""yes""","""no""","""22AUG1977""","""01OCT2022-30SEP2023""","""01OCT2022-30SEP2023""","""01Jan2019-31Dec2022""","""As Expected""","""1""","""529""","""19.8""","""30.5""","""13.3""","""01Jan2022-31Dec2022""","""As Expected""","""1""","""117""","""205.7""","""97.9""","""01Jan2022-31Dec2022""","""As Expected""","""1""",…,"""11""","""1""","""192""","""1771""","""22""","""259""","""0""",,,"""2024""","""7""","""san jose""","""santa clara""","""(408) 885-5730""",,,,,,,,,,,,,,,,,,,,,,,""""""
"""52311""","""18""","""St. Joseph Hospital Renal Cent…","""01Jan2019-31Dec2022""","""3""","""1""","""Sr. Elizabeth Bldg. 1100 W. St…",,"""CA""","""92868""","""non-profit""","""no""","""Independent""","""yes""","""39""","""yes""","""yes""","""yes""","""15AUG1977""","""01OCT2022-30SEP2023""","""01OCT2022-30SEP2023""","""01Jan2019-31Dec2022""","""As Expected""","""1""","""533""","""22.9""","""33.7""","""16.1""","""01Jan2022-31Dec2022""","""Better than Expected""","""1""","""111""","""129.0""","""49.7""","""01Jan2022-31Dec2022""","""As Expected""","""1""",…,"""15""","""1""","""161""","""1437""","""14""","""1""","""14""","""78""","""97""","""2024""","""7""","""orange""","""orange""","""(714) 771-8037""",,,,,,,,,,,,,,,,,,,,,,,""""""
"""52321""","""18""","""Childrens Hospital of Los Ange…","""01Jan2019-31Dec2022""",,"""260""","""Division of Nephrology (Dialys…",,"""CA""","""90027""","""non-profit""","""no""","""Independent""","""no""","""10""","""yes""","""yes""","""yes""","""28JUL1977""","""01OCT2022-30SEP2023""","""01OCT2022-30SEP2023""","""01Jan2019-31Dec2022""","""not Available""","""199""","""54""",,,,"""01Jan2022-31Dec2022""","""not Available""","""199""","""5""",,,"""01Jan2022-31Dec2022""","""As Expected""","""1""",…,,"""199""","""3""","""28""",,"""1""","""54""","""456""","""81""","""2024""","""7""","""los angeles""","""los angeles""","""(323) 361-2560""",,,,,,,,,,,,,,,,,,,,,,,""""""
"""52323""","""18""","""Kaiser Foundation Hospital Med…","""01Jan2019-31Dec2022""","""4""","""1""","""Dialysis Unit 4700 Sunset Blvd…",,"""CA""","""90027""","""non-profit""","""yes""","""Kaiser Permanente""","""yes""","""30""","""yes""","""yes""","""yes""","""25JUL1977""","""01OCT2022-30SEP2023""","""01OCT2022-30SEP2023""","""01Jan2019-31Dec2022""","""As Expected""","""1""","""793""","""18.9""","""27.4""","""13.6""","""01Jan2022-31Dec2022""","""As Expected""","""1""","""183""","""161.6""","""79.3""","""01Jan2022-31Dec2022""","""As Expected""","""1""",…,"""13""","""1""","""133""","""1213""","""18""","""259""","""0""",,,"""2024""","""7""","""los angeles""","""los angeles""","""(323) 783-5580""",,,,,,,,,,,,,,,,,,,,,,,""""""
"""52334""","""18""","""Arrowhead Regional Medical Cen…","""01Jan2019-31Dec2022""","""4""","""1""","""400 N. Pepper Avenue""",,"""CA""","""92324""","""non-profit""","""no""","""Independent""","""no""","""8""","""yes""","""no""","""no""","""28APR2006""","""01OCT2022-30SEP2023""","""01OCT2022-30SEP2023""","""01Jan2019-31Dec2022""","""As Expected""","""1""","""282""","""12.2""","""23.1""","""6.2""","""01Jan2022-31Dec2022""","""As Expected""","""1""","""68""","""165.7""","""52.7""","""01Jan2022-31Dec2022""","""As Expected""","""1""",…,"""9""","""1""","""105""","""1073""","""9""","""259""","""0""",,,"""2024""","""7""","""colton""","""san bernardino""","""(909) 580-3911""",,,,,,,,,,,,,,,,,,,,,,,""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""552607""","""17""","""NORTHGATE DIALYSIS CENTER""","""01Jan2012-31Dec2015""","""3""","""1""","""650 LAS GALLINAS ROAD""","""""","""CA""","""94903""","""profit""","""yes""","""DAVITA""","""no""","""12""","""yes""",,"""yes""",,"""01JAN2016-31DEC2016""",,"""01Jan2013-31Dec2016""","""As Expected""","""1""","""422""","""13.9""","""18.2""","""10.5""","""01Jan2016-31Dec2016""","""As Expected""","""1""","""56""","""326.6""","""105.3""","""01Jan2016-31Dec2016""",,"""1""",…,"""9""",,,,,,,,,"""2017""","""10""","""san rafael""","""marin""","""4154440376""","""23-Jan-09""","""01JAN2016-31DEC2016""",,,"""yes""",,"""4""","""52""","""0.24""","""87""","""89""","""""","""""","""48""","""340""","""1""","""67""","""1""","""10""","""181.7""","""As Expected""","""34.2""",""""""
"""552608""","""17""","""CORNERHOUSE DIALYSIS CENTER""","""01Jan2012-31Dec2015""","""2""","""1""","""2005 NAGLEE AVENUE""","""""","""CA""","""95128""","""profit""","""yes""","""DAVITA""","""yes""","""16""","""yes""",,"""no""",,"""01JAN2016-31DEC2016""",,"""01Jan2013-31Dec2016""","""As Expected""","""1""","""529""","""15.4""","""19.8""","""11.9""","""01Jan2016-31Dec2016""","""As Expected""","""1""","""73""","""355.9""","""133.1""","""01Jan2016-31Dec2016""",,"""1""",…,"""13""",,,,,,,,,"""2017""","""10""","""san jose""","""santa clara""","""4089980183""","""26-nov-08""","""01JAN2016-31DEC2016""",,,"""no""",,"""13""","""67""","""0.91""","""86""","""""","""""","""""","""67""","""449""","""1""","""78""","""1""","""12""","""214.7""","""Worse than Expected""","""39.3""",""""""
"""552609""","""18""","""SATELLITE HEALTHCARE ORANGE""","""01Jan2012-31Dec2015""","""5""","""1""","""1518 W. LA VETA AVENUE""","""""","""CA""","""92868""","""non_profit""","""yes""","""SATELLITE HEALTHCARE""","""yes""","""24""","""yes""",,"""no""",,"""01JAN2016-31DEC2016""",,"""01Jan2013-31Dec2016""","""As Expected""","""1""","""699""","""15.1""","""19""","""11.8""","""01Jan2016-31Dec2016""","""As Expected""","""1""","""146""","""206.3""","""80.2""","""01Jan2016-31Dec2016""",,"""1""",…,"""11""",,,,,,,,,"""2017""","""10""","""orange""","""orange""","""7142859675""","""29-Dec-08""","""01JAN2016-31DEC2016""",,,"""yes""",,"""12""","""122""","""0.33""","""98""","""89""","""""","""""","""133""","""1009""","""1""","""77""","""1""","""8""","""123.7""","""As Expected""","""16.3""",""""""
"""552611""","""17""","""WALNUT CREEK AT HOME""","""01Jan2012-31Dec2015""","""3""","""1""","""400 N WIGET LANE""","""""","""CA""","""94598""","""profit""","""yes""","""DAVITA""","""no""","""3""","""yes""",,"""yes""",,"""01JAN2016-31DEC2016""",,"""01Jan2013-31Dec2016""","""As Expected""","""1""","""104""","""13.1""","""28.5""","""4.8""","""01Jan2016-31Dec2016""","""As Expected""","""1""","""17""","""538.4""","""78.1""","""01Jan2016-31Dec2016""",,"""1""",…,"""17""",,,,,,,,,"""2017""","""10""","""walnut creek""","""contra costa""","""9259799732""","""31-Oct-08""","""01JAN2016-31DEC2016""",,,"""yes""",,"""""","""14""","""""","""""","""""","""""","""""","""9""","""82""","""199""","""""","""199""","""""","""197.1""","""As Expected""","""6.9""",""""""
