In [67]:
import pandas as pd
import requests
from io import BytesIO
import re
from datetime import datetime
import pyarrow as pa
import pyarrow.parquet as pq

In [68]:
# Defnining a function to download and combine excel files into a dataframe. 

# Takes as input a list of urls, and a number of rows to skip, which varies by year.

def download_excel_files(urls, rows_to_skip, sheet_number = 1):
    all_data = []

    for i, url in enumerate(urls, 1):
        # Download an Excel file
        response = requests.get(url)
        
        if response.status_code == 200:
            # Read the Excel file into a BytesIO object
            excel_file = BytesIO(response.content)
            
            # Read the Excel file, specifying dtype for CENSUS_KEY and CENS_TRACT
            # Census dtype specification necessary to prevent pandas from converting these to floats
            df = pd.read_excel(
                excel_file, 
                skiprows=rows_to_skip, 
                sheet_name=sheet_number,
                dtype={'CENSUS_KEY': str, 'CENS_TRACT': str}
            )
            
            # Rename 'CENSUS_KEY' to 'CENS_TRACT' if it exists
            if 'CENSUS_KEY' in df.columns:
                df = df.rename(columns={'CENSUS_KEY': 'CENS_TRACT'})
            
            # Add columns to identify which file this data came from
            df['source_file'] = f"file_{i}"
            df['year'] = url.split('/')[-1].split('_')[0][-2:]  # Extract year from filename
            
            all_data.append(df)
            
            print(f"Downloaded and processed file {i}")
        else:
            print(f"Failed to download file from {url}. Status code: {response.status_code}")

    if all_data:
        # Combine all DataFrames into one
        combined_df = pd.concat(all_data, ignore_index=True)
        return combined_df
    else:
        print("No data was successfully downloaded.")
        return None

In [69]:
# Attempted to use beautifulsoup to scrape the specialty care clinic file urls, but neither xpath or css selector worked reliably. 

# I split the urls into 2 lists, pre_2018_urls and post_2018_urls, on account of inconsistent data structure.

# I also define a data dictionary url to create a dataframe that can be used to standardize column names between the newer and older datasets. 

# The plan is to create two dataframes for pre- and post-2018, modify the former's structure to match the latter, and combine.

# Define a list of urls for files pertaining to 2013–2017

pre_2018_urls = [
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/896c699c-07fc-4049-bda0-ff98ac8e3913/download/spcl13utildatafinal.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/91fa31b7-8f40-47f1-8bca-bbc063221993/download/spcl14utildatafinal.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/171f7631-4cb2-4b20-b238-d5ab3512ae10/download/spcl15utildatafinal.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/c6a99713-427a-44df-947d-d46c3402a4d6/download/spcl16_util_data_final-ver2.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/e7a2def1-c0dd-41af-a283-46e095bc0af2/download/spcl17_util_data_final.xlsx"
]

# Define a list of urls for files pertaining to 2018–2023

post_2018_urls = [
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/8ad9b464-cbbd-4ad5-b37d-d2daa924768b/download/spcl23_util_data_prelim.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/00a9d637-d75a-4ba5-9ed5-87bb01f3a6e3/download/spcl22_util_data_final.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/f6339c46-8e35-4466-b972-ce132c43cbf4/download/spcl21_util_data_final.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/9c883633-b661-4da3-b39f-50536f60e573/download/spcl20_util_data_final.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/188b31e3-2307-479e-9bee-632083f902ba/download/spcl19_util_data_final.xlsx",
    "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/e891cdff-6092-4316-b406-dcbcf4a9c016/download/spcl18_util_data_final.xlsx"
]

data_dictionary_url = ["https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/188b31e3-2307-479e-9bee-632083f902ba/download/spcl19_util_data_final.xlsx"]

In [70]:
pre_2018_df = download_excel_files(pre_2018_urls, rows_to_skip = [1,2,3])

post_2018_df = download_excel_files(post_2018_urls, rows_to_skip = [1,2,3,4])

data_dictionary = download_excel_files(data_dictionary_url, rows_to_skip = 0, sheet_number = 3)

Downloaded and processed file 1
Downloaded and processed file 2
Downloaded and processed file 3
Downloaded and processed file 4
Downloaded and processed file 5
Downloaded and processed file 1
Downloaded and processed file 2
Downloaded and processed file 3
Downloaded and processed file 4
Downloaded and processed file 5
Downloaded and processed file 6


  combined_df = pd.concat(all_data, ignore_index=True)


Downloaded and processed file 1


# Data cleaning to merge the two sets of historical data

In [71]:
pre_2018_df

Unnamed: 0,OSHPD_ID,FAC_NAME,FAC_ADDRESS_ONE,FAC_ADDRESS_TWO,FAC_CITY,FAC_ZIPCODE,FAC_PHONE,FAC_ADMIN_NAME,FAC_OPER_CURRYR,BEG_DATE,...,PROJ_03_PROJTD_CAP_EXP,PROJ_03_OSHPD_PROJ_NO,PROJ_04_DESCRIP_CAP_EXP,PROJ_04_PROJTD_CAP_EXP,PROJ_04_OSHPD_PROJ_NO,PROJ_05_DESCRIP_CAP_EXP,PROJ_05_PROJTD_CAP_EXP,PROJ_05_OSHPD_PROJ_NO,source_file,year
0,306013662.0,UNION CITY DIALYSIS CENTER,32930 ALVARADO NILES ROAD NO.300,,UNION CITY,94587,253-733-4847,Vicki Kertzman,Yes,2013-01-01,...,,,,,,,,,file_1,sx
1,306013683.0,BERKELEY DIALYSIS,2920 TELEGRAPH AVENUE,,BERKELEY,94705,253-733-4847,Vicki Kertzman,No,2013-01-01,...,,,,,,,,,file_1,sx
2,306104002.0,ALLIANT INTERNATIONAL UNIVERSITY,5130 EAST CLINTON WAY,,FRESNO,93727,559-253-2277,"Robert N. Harris, Ph.D.",Yes,2013-01-01,...,,,,,,,,,file_1,sx
3,306121019.0,FRESENIUS MEDICAL CARE OF EUREKA,2765 TIMBER RIDGE LANE,,EUREKA,95501,707-445-2033,Clarke Sabandal,Yes,2013-01-01,...,,,,,,,,,file_1,sx
4,306134002.0,EL CENTRO DESERT VALLEY DIALYSIS CENTER,110 SO. FIFTH STREET,,EL CENTRO,92243,760-353-0353,Ana Charves,Yes,2013-01-01,...,,,,,,,,,file_1,sx
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3074,306190916.0,ROWLAND HEIGHTS DIALYSIS,17875 COLIMA RD,STE A,CITY OF INDUSTRY,91748,626-964-5849,Katherine Crespo,No,2017-01-01,...,,,,,,,,,file_5,17
3075,306334726.0,FRESENIUS MEDICAL CARE MORENO VALLEY,27420 IRIS AVE.,,MORENO VALLEY,92555,951-242-9196,JASON BAUER,Yes,2017-01-01,...,,,,,,,,,file_5,17
3076,306196095.0,FMC DIALYSIS SERVICES OF WEST COVINA,1540 W WEST COVINA PKWY,,WEST COVINA,91790,626-337-8007,ANIL VAIDYA,Yes,2017-01-01,...,,,,,,,,,file_5,17
3077,306304303.0,RAI - LAGUNA CANYON - IRVINE,16255 LAGUNA CANYON RD,,IRVINE,92618-3603,949-727-4495,JASON BAUER,Yes,2017-01-01,...,,,,,,,,,file_5,17


In [72]:
post_2018_df

Unnamed: 0,Description,FAC_NO,FAC_NAME,FAC_STR_ADDR,FAC_CITY,FAC_ZIP,FAC_PHONE,FAC_ADMIN_NAME,FAC_OPERATED_THIS_YR,FAC_OP_PER_BEGIN_DT,...,MEANS_FOR_ACQUISITION_08,MEANS_FOR_ACQUISITION_09,MEANS_FOR_ACQUISITION_10,source_file,year,OSHPD_PROJ_NO_01,OSHPD_PROJ_NO_02,OSHPD_PROJ_NO_03,OSHPD_PROJ_NO_04,OSHPD_PROJ_NO_05
0,,306010568.0,FRESENIUS KIDNEY CARE UNION CITY,"1320 DECOTO RD, SUITE 100",UNION CITY,94587,5104042511,Rocko Graziano,Yes,2023-01-01,...,,,,file_1,23,,,,,
1,,306010583.0,OAKLAND LAUREL DIALYSIS,"3814 MACARTHUR BLVD, STE 201",OAKLAND,94619,2537334847,Vicki Kertzman,Yes,2023-01-01,...,,,,file_1,23,,,,,
2,,306010617.0,CASTRO VALLEY DIALYSIS,20359 LAKE CHABOT RD.,CASTRO VALLEY,94546,2537334847,Vicki Kertzman,Yes,2023-01-01,...,,,,file_1,23,,,,,
3,,306010634.0,RAI - BANCROFT AVE - OAKLAND,"610 HEGENBERGER RD, STE 101B",OAKLAND,94621,510-553-1333,ROCKO GRAZIANO,Yes,2023-01-01,...,,,,file_1,23,,,,,
4,,306010731.0,PLEASANTON SANTA RITA DIALYSIS,"4270 ROSEWOOD DR, STE E",PLEASANTON,94588,2537334847,Vicki Kertzman,Yes,2023-01-01,...,,,,file_1,23,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4197,,306574012.0,FRESENIUS MEDICAL CARE WOODLAND,35 W MAIN ST,WOODLAND,95695,5306684503,ERIC ANDERSON,Yes,2018-01-01,...,,,,file_6,18,,,,,
4198,,306574019.0,WEST SACRAMENTO DIALYSIS CENTER,"3450 INDUSTRIAL BLVD, STE 100",WEST SACRAMENTO,95691,2537334847,Vicki Kertzman,Yes,2018-01-01,...,,,,file_6,18,,,,,
4199,,306584006.0,MARYSVILLE DIALYSIS CENTER,1015 8TH ST,MARYSVILLE,95901,2537334847,Vicki Kertzman,Yes,2018-01-01,...,,,,file_6,18,,,,,
4200,,306584011.0,BABY BUDDIES BIRTH CENTER,"1908 N BEALE RD, STE C",MARYSVILLE,95901,530-743-6888,Rachel Farrell,Yes,2018-01-01,...,,,,file_6,18,,,,,


In [73]:
data_dictionary

Unnamed: 0,Page,Line,Column,SIERA Dataset Header (2019),ALIRTS Dataset Header (2017),Notes,source_file,year
0,1,1,1,FAC_NAME,FAC_NAME,,file_1,19
1,1,2,1,FAC_NO,OSHPD_ID,,file_1,19
2,1,3,1,FAC_STR_ADDR,,Address 1 and Address 2 combined into single l...,file_1,19
3,1,4,1,FAC_CITY,FAC_CITY,,file_1,19
4,1,5,1,FAC_ZIP,FAC_ZIPCODE,,file_1,19
...,...,...,...,...,...,...,...,...
124,5,23,2,PROJ_EXPENDITURES_04,PROJ_04_PROJTD_CAP_EXP,,file_1,19
125,5,23,3,OSHPD_PROJ_NO_04,PROJ_04_OSHPD_PROJ_NO,,file_1,19
126,5,24,1,DEPROJ_05,PROJ_05_DESCRIP_CAP_EXP,,file_1,19
127,5,24,2,PROJ_EXPENDITURES_05,PROJ_05_PROJTD_CAP_EXP,,file_1,19


In [74]:
# Creating a dictionary of old an new column names to rename columns in the pre-2018 dataframe.

old_names = data_dictionary["ALIRTS Dataset Header (2017)"]

new_names = data_dictionary["SIERA Dataset Header (2019)"]

name_mapping = dict(zip(old_names, new_names))

# Renaming the columns in the pre-2018 dataframe.

pre_2018_df = pre_2018_df.rename(columns=name_mapping)

In [75]:
pre_2018_df

Unnamed: 0,FAC_NO,FAC_NAME,FAC_ADDRESS_ONE,FAC_ADDRESS_TWO,FAC_CITY,FAC_ZIP,FAC_PHONE,FAC_ADMIN_NAME,FAC_OPERATED_THIS_YR,FAC_OP_PER_BEGIN_DT,...,PROJ_EXPENDITURES_03,OSHPD_PROJ_NO_03,DEPROJ_04,PROJ_EXPENDITURES_04,OSHPD_PROJ_NO_04,DEPROJ_05,PROJ_EXPENDITURES_05,OSHPD_PROJ_NO_05,source_file,year
0,306013662.0,UNION CITY DIALYSIS CENTER,32930 ALVARADO NILES ROAD NO.300,,UNION CITY,94587,253-733-4847,Vicki Kertzman,Yes,2013-01-01,...,,,,,,,,,file_1,sx
1,306013683.0,BERKELEY DIALYSIS,2920 TELEGRAPH AVENUE,,BERKELEY,94705,253-733-4847,Vicki Kertzman,No,2013-01-01,...,,,,,,,,,file_1,sx
2,306104002.0,ALLIANT INTERNATIONAL UNIVERSITY,5130 EAST CLINTON WAY,,FRESNO,93727,559-253-2277,"Robert N. Harris, Ph.D.",Yes,2013-01-01,...,,,,,,,,,file_1,sx
3,306121019.0,FRESENIUS MEDICAL CARE OF EUREKA,2765 TIMBER RIDGE LANE,,EUREKA,95501,707-445-2033,Clarke Sabandal,Yes,2013-01-01,...,,,,,,,,,file_1,sx
4,306134002.0,EL CENTRO DESERT VALLEY DIALYSIS CENTER,110 SO. FIFTH STREET,,EL CENTRO,92243,760-353-0353,Ana Charves,Yes,2013-01-01,...,,,,,,,,,file_1,sx
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3074,306190916.0,ROWLAND HEIGHTS DIALYSIS,17875 COLIMA RD,STE A,CITY OF INDUSTRY,91748,626-964-5849,Katherine Crespo,No,2017-01-01,...,,,,,,,,,file_5,17
3075,306334726.0,FRESENIUS MEDICAL CARE MORENO VALLEY,27420 IRIS AVE.,,MORENO VALLEY,92555,951-242-9196,JASON BAUER,Yes,2017-01-01,...,,,,,,,,,file_5,17
3076,306196095.0,FMC DIALYSIS SERVICES OF WEST COVINA,1540 W WEST COVINA PKWY,,WEST COVINA,91790,626-337-8007,ANIL VAIDYA,Yes,2017-01-01,...,,,,,,,,,file_5,17
3077,306304303.0,RAI - LAGUNA CANYON - IRVINE,16255 LAGUNA CANYON RD,,IRVINE,92618-3603,949-727-4495,JASON BAUER,Yes,2017-01-01,...,,,,,,,,,file_5,17


In [76]:
# Creating a function to combine street address columns in pre-2018 dataframe.

def combine_street_address(df, col1, col2, new_col_name):
    
    # Combine columns, handling NaN values
    df[new_col_name] = df[col1].fillna('').astype(str) + df[col2].fillna('').apply(lambda x: f', {x}' if x else '')
    
    # Remove trailing comma and space if col2 was empty
    df[new_col_name] = df[new_col_name].str.rstrip(', ')

    # Remove original columns
    df.drop(columns=[col1, col2], inplace=True)

    return df

In [77]:
# Creating combined street adress and parent company address columns in the pre-2018 dataframe.

combine_street_address(pre_2018_df, "FAC_ADDRESS_ONE", "FAC_ADDRESS_TWO", "FAC_STR_ADDR")

combine_street_address(pre_2018_df, "PARENT_ADDRESS_ONE", "PARENT_ADDRESS_TWO", "FAC_PAR_CORP_BUS_ADDR")

Unnamed: 0,FAC_NO,FAC_NAME,FAC_CITY,FAC_ZIP,FAC_PHONE,FAC_ADMIN_NAME,FAC_OPERATED_THIS_YR,FAC_OP_PER_BEGIN_DT,FAC_OP_PER_END_DT,FAC_PAR_CORP_NAME,...,DEPROJ_04,PROJ_EXPENDITURES_04,OSHPD_PROJ_NO_04,DEPROJ_05,PROJ_EXPENDITURES_05,OSHPD_PROJ_NO_05,source_file,year,FAC_STR_ADDR,FAC_PAR_CORP_BUS_ADDR
0,306013662.0,UNION CITY DIALYSIS CENTER,UNION CITY,94587,253-733-4847,Vicki Kertzman,Yes,2013-01-01,2013-12-31,DaVita HealthCare Partners,...,,,,,,,file_1,sx,32930 ALVARADO NILES ROAD NO.300,1423 Pacific Ave
1,306013683.0,BERKELEY DIALYSIS,BERKELEY,94705,253-733-4847,Vicki Kertzman,No,2013-01-01,2013-12-31,,...,,,,,,,file_1,sx,2920 TELEGRAPH AVENUE,
2,306104002.0,ALLIANT INTERNATIONAL UNIVERSITY,FRESNO,93727,559-253-2277,"Robert N. Harris, Ph.D.",Yes,2013-01-01,2013-12-31,Alliant International University,...,,,,,,,file_1,sx,5130 EAST CLINTON WAY,5130 East Clinton Way
3,306121019.0,FRESENIUS MEDICAL CARE OF EUREKA,EUREKA,95501,707-445-2033,Clarke Sabandal,Yes,2013-01-01,2013-12-31,,...,,,,,,,file_1,sx,2765 TIMBER RIDGE LANE,
4,306134002.0,EL CENTRO DESERT VALLEY DIALYSIS CENTER,EL CENTRO,92243,760-353-0353,Ana Charves,Yes,2013-01-01,2013-12-31,,...,,,,,,,file_1,sx,110 SO. FIFTH STREET,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3074,306190916.0,ROWLAND HEIGHTS DIALYSIS,CITY OF INDUSTRY,91748,626-964-5849,Katherine Crespo,No,2017-01-01,2017-12-31,,...,,,,,,,file_5,17,"17875 COLIMA RD, STE A",
3075,306334726.0,FRESENIUS MEDICAL CARE MORENO VALLEY,MORENO VALLEY,92555,951-242-9196,JASON BAUER,Yes,2017-01-01,2017-12-31,,...,,,,,,,file_5,17,27420 IRIS AVE.,
3076,306196095.0,FMC DIALYSIS SERVICES OF WEST COVINA,WEST COVINA,91790,626-337-8007,ANIL VAIDYA,Yes,2017-01-01,2017-12-31,,...,,,,,,,file_5,17,1540 W WEST COVINA PKWY,
3077,306304303.0,RAI - LAGUNA CANYON - IRVINE,IRVINE,92618-3603,949-727-4495,JASON BAUER,Yes,2017-01-01,2017-12-31,,...,,,,,,,file_5,17,16255 LAGUNA CANYON RD,


In [78]:
post_2018_df

Unnamed: 0,Description,FAC_NO,FAC_NAME,FAC_STR_ADDR,FAC_CITY,FAC_ZIP,FAC_PHONE,FAC_ADMIN_NAME,FAC_OPERATED_THIS_YR,FAC_OP_PER_BEGIN_DT,...,MEANS_FOR_ACQUISITION_08,MEANS_FOR_ACQUISITION_09,MEANS_FOR_ACQUISITION_10,source_file,year,OSHPD_PROJ_NO_01,OSHPD_PROJ_NO_02,OSHPD_PROJ_NO_03,OSHPD_PROJ_NO_04,OSHPD_PROJ_NO_05
0,,306010568.0,FRESENIUS KIDNEY CARE UNION CITY,"1320 DECOTO RD, SUITE 100",UNION CITY,94587,5104042511,Rocko Graziano,Yes,2023-01-01,...,,,,file_1,23,,,,,
1,,306010583.0,OAKLAND LAUREL DIALYSIS,"3814 MACARTHUR BLVD, STE 201",OAKLAND,94619,2537334847,Vicki Kertzman,Yes,2023-01-01,...,,,,file_1,23,,,,,
2,,306010617.0,CASTRO VALLEY DIALYSIS,20359 LAKE CHABOT RD.,CASTRO VALLEY,94546,2537334847,Vicki Kertzman,Yes,2023-01-01,...,,,,file_1,23,,,,,
3,,306010634.0,RAI - BANCROFT AVE - OAKLAND,"610 HEGENBERGER RD, STE 101B",OAKLAND,94621,510-553-1333,ROCKO GRAZIANO,Yes,2023-01-01,...,,,,file_1,23,,,,,
4,,306010731.0,PLEASANTON SANTA RITA DIALYSIS,"4270 ROSEWOOD DR, STE E",PLEASANTON,94588,2537334847,Vicki Kertzman,Yes,2023-01-01,...,,,,file_1,23,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4197,,306574012.0,FRESENIUS MEDICAL CARE WOODLAND,35 W MAIN ST,WOODLAND,95695,5306684503,ERIC ANDERSON,Yes,2018-01-01,...,,,,file_6,18,,,,,
4198,,306574019.0,WEST SACRAMENTO DIALYSIS CENTER,"3450 INDUSTRIAL BLVD, STE 100",WEST SACRAMENTO,95691,2537334847,Vicki Kertzman,Yes,2018-01-01,...,,,,file_6,18,,,,,
4199,,306584006.0,MARYSVILLE DIALYSIS CENTER,1015 8TH ST,MARYSVILLE,95901,2537334847,Vicki Kertzman,Yes,2018-01-01,...,,,,file_6,18,,,,,
4200,,306584011.0,BABY BUDDIES BIRTH CENTER,"1908 N BEALE RD, STE C",MARYSVILLE,95901,530-743-6888,Rachel Farrell,Yes,2018-01-01,...,,,,file_6,18,,,,,


In [79]:
def compare_columns(df1, df2):
    set1 = set(df1.columns)
    set2 = set(df2.columns)
    
    only_in_df1 = set1 - set2
    only_in_df2 = set2 - set1
    
    return only_in_df1, only_in_df2

# Usage
columns_only_in_df1, columns_only_in_df2 = compare_columns(pre_2018_df, post_2018_df)

print("Columns only in df1:", columns_only_in_df1)
print("Columns only in df2:", columns_only_in_df2)

Columns only in df1: {'LIC_STATUS_DATE', 'LIC_ORIG_DATE', 'REPORT_STATUS', 'MCARE_PROVIDER_NO', 'ACLAIMS_NO', 'MCAL_PROVIDER_NO'}
Columns only in df2: {'HCAI_PROJ_NO_03', 'LICENSE_EFF_DATE', 'REVISED_DT', 'HCAI_PROJ_NO_02', 'LICENSE_EXP_DATE', 'FACILITY_LEVEL', 'Description', 'HCAI_PROJ_NO_04', 'CORRECTED_DT', 'HCAI_PROJ_NO_01', 'SUBMITTED_DT', 'REV_REPT_PREP_NAME', 'HCAI_PROJ_NO_05'}


In [80]:
pre_2018_df["FAC_NO"].nunique()

706

In [81]:
post_2018_df["FAC_NO"].nunique()

778

In [82]:
def compare_unique_values(df1, df2, column_name):
    # Get unique values from each DataFrame
    unique_df1 = set(df1[column_name].unique())
    unique_df2 = set(df2[column_name].unique())
    
    # Find values in df1 but not in df2
    only_in_df1 = unique_df1 - unique_df2
    
    # Find values in df2 but not in df1
    only_in_df2 = unique_df2 - unique_df1
    
    # Find values in both
    in_both = unique_df1.intersection(unique_df2)
    
    return only_in_df1, only_in_df2, in_both

# Usage
column_to_compare = 'FAC_NO'
only_in_df1, only_in_df2, in_both = compare_unique_values(pre_2018_df, post_2018_df, column_to_compare)

print(f"Unique values only in df1: {only_in_df1}")
print(f"Unique values only in df2: {only_in_df2}")
print(f"Unique values in both: {in_both}")

Unique values only in df1: {np.float64(306494082.0), np.float64(306134019.0), np.float64(306481027.0), np.float64(306304135.0), np.float64(306374538.0), np.float64(306014219.0), np.float64(306434187.0), np.float64(306190988.0), np.float64(306194574.0), np.float64(306304142.0), np.float64(306304528.0), np.float64(306364172.0), np.float64(306374162.0), np.float64(306154003.0), np.float64(306196503.0), np.float64(306334106.0), np.float64(306314010.0), np.float64(306491037.0), np.float64(306364061.0), np.float64(306196511.0), np.float64(306154016.0), np.float64(306434079.0), np.float64(306394019.0), np.float64(306240036.0), np.float64(306234021.0), np.float64(306384038.0), np.float64(306374179.0), np.float64(306014248.0), np.float64(306190632.0), np.float64(306364209.0), np.float64(306540082.0), np.float64(306344117.0), np.float64(306234041.0), np.float64(306191165.0), np.float64(nan), np.float64(306334015.0), np.float64(306244032.0), np.float64(306304573.0), np.float64(306374088.0), np.fl

In [83]:
def compare_shared_column_types(df1, df2):
    # Find shared columns
    shared_columns = list(set(df1.columns) & set(df2.columns))
    
    if not shared_columns:
        print("No shared columns found between the DataFrames")
        return None
    
    # Compare data types
    comparison = {}
    for col in shared_columns:
        type1 = df1[col].dtype
        type2 = df2[col].dtype
        comparison[col] = {
            'df1_type': type1,
            'df2_type': type2,
            'match': type1 == type2
        }
    
    # Convert to DataFrame for easy viewing
    comparison_df = pd.DataFrame.from_dict(comparison, orient='index')
    
    return comparison_df

# Usage
type_comparison = compare_shared_column_types(pre_2018_df, post_2018_df)

if type_comparison is not None:
    print("Shared column type comparison:")
    print(type_comparison)
    
    # Identify mismatched columns
    mismatched = type_comparison[type_comparison['match'] == False]
    if not mismatched.empty:
        print("\nColumns with mismatched types:")
        print(mismatched)
    else:
        print("\nAll shared columns have matching types.")

Shared column type comparison:
                               df1_type df2_type  match
source_file                      object   object   True
NET_OPER_TOT                    float64  float64   True
MEANS_FOR_ACQUISITION_07        float64  float64   True
MEANS_FOR_ACQUISITION_05        float64  float64   True
DEPROJ_05                       float64  float64   True
...                                 ...      ...    ...
PROJ_EXPENDITURES_05            float64  float64   True
APP_IN_HOME_TRAINING_CAPD_CCPD   object   object   True
DT_ACQUIRE_08                   float64  float64   True
DT_ACQUIRE_04                   float64  float64   True
OTHER_OPER_REVENUE_OTHER_TOT    float64  float64   True

[123 rows x 3 columns]

Columns with mismatched types:
                         df1_type        df2_type  match
MEANS_FOR_ACQUISITION_01  float64          object  False
DEEQUIP_01                float64          object  False
SENATE_DIST               float64          object  False
DT_ACQUIRE_01

In [84]:
# Converting aquisition-related columns in pre-2018 dataframe to types in post-2018 dataframe.

# Pre-2018 dataframe doesn't contain any values for these columns pre-2018, which is why they were imputed differently from those post-2018, which do contain information.

pre_2018_df["DT_ACQUIRE_01"] = pd.to_datetime(pre_2018_df["DT_ACQUIRE_01"])

pre_2018_df["DEEQUIP_01"] = pre_2018_df["DEEQUIP_01"].astype("str")

pre_2018_df["MEANS_FOR_ACQUISITION_01"] = pre_2018_df["MEANS_FOR_ACQUISITION_01"].astype("str")

In [85]:
# Creating a function to remove the string "District " from SENATE_DIST, ASSEMBLY_DIST, and CONGRESS_DIST columns in post-2018 dataframe. 

def clean_and_convert_to_numeric(df, columns):
    def clean_numeric(value):
        if pd.isna(value):
            return value
        # Remove all non-digit characters
        cleaned = re.sub(r'\D', '', str(value))
        return cleaned if cleaned else None

    for col in columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in the DataFrame. Skipping.")
            continue
        
        # Apply the cleaning function and convert to integer
        df[col] = df[col].apply(clean_numeric).astype('Int64')
    
    return df

columns_to_clean = ['SENATE_DIST', 'CONGRESS_DIST', 'ASSEMBLY_DIST']
post_2018_df = clean_and_convert_to_numeric(post_2018_df, columns_to_clean)

In [86]:
# Creating a function to merge the pre-2018 and post-2018 dataframes
# Finds all shared columns between df1 and df2 using set intersection.
# Checks if there are any shared columns. If not, it raises an error.
# Merges the DataFrames using all shared columns.

# Using how='outer' ensures that all rows from both DataFrames are kept, even if there's no match on all shared columns.
# suffixes=('_df1', '_df2') are added to disambiguate column names that are in both DataFrames but weren't used for merging.

def merge_on_shared_columns(df1, df2):
    # Find shared columns
    shared_columns = list(set(df1.columns) & set(df2.columns))
    
    # Ensure there are shared columns
    if not shared_columns:
        raise ValueError("No shared columns found between the DataFrames")
    
    # Merge DataFrames on all shared columns
    merged_df = pd.merge(df1, df2, on=shared_columns, how='outer', suffixes=('_df1', '_df2'))
    
    return merged_df

In [87]:
# Attempting to merge pre-2018 and post-2018 dataframes using shared columns
# If successful, print the shape of the merged dataframe
# If unsuccessful due to no shared columns, catch and print the error

try:
    merged_df = merge_on_shared_columns(pre_2018_df, post_2018_df)
    print("Merge successful")
    print(f"Shape of merged DataFrame: {merged_df.shape}")
except ValueError as e:
    print(f"Error: {e}")

Merge successful
Shape of merged DataFrame: (7281, 142)


In [88]:
def compare_fac_no(pre_2018_df, post_2018_df):
    # Get unique FAC_NO values from each dataframe
    pre_2018_fac_no = set(pre_2018_df['FAC_NO'].dropna().unique())
    post_2018_fac_no = set(post_2018_df['FAC_NO'].dropna().unique())

    # Find FAC_NO values only in pre_2018_df
    only_in_pre = pre_2018_fac_no - post_2018_fac_no

    # Find FAC_NO values only in post_2018_df
    only_in_post = post_2018_fac_no - pre_2018_fac_no

    # Find FAC_NO values in both dataframes
    in_both = pre_2018_fac_no.intersection(post_2018_fac_no)

    print(f"Number of FAC_NO only in pre_2018_df: {len(only_in_pre)}")
    print(f"Number of FAC_NO only in post_2018_df: {len(only_in_post)}")
    print(f"Number of FAC_NO in both dataframes: {len(in_both)}")
    print(f"Total unique FAC_NO across both dataframes: {len(pre_2018_fac_no.union(post_2018_fac_no))}")

# Use the function
compare_fac_no(pre_2018_df, post_2018_df)

Number of FAC_NO only in pre_2018_df: 59
Number of FAC_NO only in post_2018_df: 131
Number of FAC_NO in both dataframes: 647
Total unique FAC_NO across both dataframes: 837


In [89]:
merged_df = merged_df.dropna(subset=['FAC_NO'])

merged_df

Unnamed: 0,FAC_NO,FAC_NAME,FAC_CITY,FAC_ZIP,FAC_PHONE,FAC_ADMIN_NAME,FAC_OPERATED_THIS_YR,FAC_OP_PER_BEGIN_DT,FAC_OP_PER_END_DT,FAC_PAR_CORP_NAME,...,REVISED_DT,CORRECTED_DT,LICENSE_EFF_DATE,LICENSE_EXP_DATE,FACILITY_LEVEL,HCAI_PROJ_NO_01,HCAI_PROJ_NO_02,HCAI_PROJ_NO_03,HCAI_PROJ_NO_04,HCAI_PROJ_NO_05
0,306540615.0,ARA KAWAEH DIALYSIS CENTER,VISALIA,93277,5597419263,Teresa Shaw,Yes,2023-01-01,2023-12-31,Teresa Shaw,...,,,2023-06-17,2024-06-16,Parent Facility,,,,,
1,306197943.0,HIGH DESERT HEALTH SYSTEM AMBULATORY SURGICAL ...,LANCASTER,93535,661 471-4000,Lee Dunham,Yes,2023-01-01,2023-12-31,,...,,,2023-11-01,2024-10-31,Parent Facility,,,,,
2,306198064.0,"MARTIN LUTHER KING, JR. AMBULATORY SURGERY CENTER",LOS ANGELES,90059,213 699-7102,Michael mills,Yes,2023-01-01,2023-12-31,,...,,,2023-05-28,2024-05-27,Parent Facility,,,,,
3,306197915.0,SHRINERS FOR CHILDREN AMBULATORY SURGICAL CENTER,PASADENA,91105,626-389-9300,Kanayo Keri,Yes,2023-01-01,2023-12-31,Shriners Hospitals for Children,...,,,2023-10-06,2024-10-05,Parent Facility,,,,,
4,306196511.0,HIGH DESERT HEALTH SYSTEM AMBULATORY SURGICAL ...,LANCASTER,93536,661-948-8581,Beryl Brooks,Yes,2013-01-01,2013-12-31,,...,,,NaT,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7275,306312304.0,RAI - HARDING BLVD. - ROSEVILLE,ROSEVILLE,95678,916-786-2728,ERIC ANDERSON,No,2018-01-01,2018-03-31,"RAI CARE CENTERS OF NORTHERN CALIFORNIA I, LLC",...,02/22/2019 03:48 PM,,2017-04-01,2018-03-31,Parent Facility,,,,,
7276,306190407.0,TORRANCE EMERALD DIALYSIS,TORRANCE,90503,2537334847,Vicki Kertzman,No,2018-01-01,2018-10-23,,...,,10/29/2019 03:26 PM,2018-10-24,2019-04-23,Parent Facility,,,,,
7277,306190350.0,PALMS VALLEY DIALYSIS,PALMDALE,93551,2537334847,Vicki Kertzman,No,2018-01-01,2018-04-09,,...,,10/29/2019 03:16 PM,2018-10-10,2019-04-09,Parent Facility,,,,,
7278,306194003.0,PARAMOUNT DIALYSIS CENTER,PARAMOUNT,90723,2537334847,Vicki Kertzman,No,2018-01-01,2018-02-04,DAVITA INC,...,,,2017-02-05,2018-02-04,Parent Facility,,,,,


In [93]:
# Converting some columns to string to match the data types in the post-2018 dataframe
# Necessary because the data types are inconsistent across the two dataframes
def convert_problematic_columns(df):
    for col in df.columns:
        # Check if column contains any non-numeric values
        if df[col].dtype == 'object' and not pd.api.types.is_numeric_dtype(df[col]):
            # Convert to string, replacing NaN with an empty string
            df[col] = df[col].fillna('').astype(str)
        elif df[col].dtype == 'object' and pd.api.types.is_numeric_dtype(df[col]):
            # If it's all numeric, convert to float (which can handle NaN)
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

# Apply the conversion function
merged_df = convert_problematic_columns(merged_df)

# Generating a timestamp for the filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Defining output path without timestamp to replace the prior version
output_path = '../../../003_data/001_raw-data/2013-2023_CHHS_dialysis-facility_data.parquet'

# Saving the merged dataframe as a parquet file
# Using parquet in order to preserve data types, optimize storage, and improve read performance.

try:
    # Saving as Parquet
    # Using compression='snappy' to optimize storage
    merged_df.to_parquet(output_path, index=False, compression='snappy')

    # Add metadata
    table = pa.Table.from_pandas(merged_df)
    metadata = table.schema.metadata

    metadata.update({
        b'created_at': str(datetime.now()).encode('utf-8'),
        b'description': b'Merged specialty care data',
        b'version': b'1.0',
        b'cleaning_steps': b'''
            1. Standardized naming convention for Census Tract columns 
            2. Renamed columns in the pre-2018 dataframe to match the post-2018 dataframe using a data dictionary.
            3. Combined street address columns in the pre-2018 dataframe.
            4. Cleaned and converted specific columns to numeric types in the post-2018 dataframe.
            5. Converted acquisition-related columns in the pre-2018 dataframe to match the data types in the post-2018 dataframe.
            6. Dropped rows with missing FAC_NO in the merged dataframe.
            7. Converted columns with mixed types to string and numeric columns to float in the merged dataframe.
        '''
    })

    updated_table = table.replace_schema_metadata(metadata)
    pq.write_table(updated_table, output_path)

    print(f"Data saved to {output_path}")
except Exception as e:
    print(f"Error saving data: {e}")
    raise

Data saved to ../../../003_data/001_raw-data/2013-2023_CHHS_dialysis-facility_data.parquet
