In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import scripts as func


In [2]:
# Load the DFC facility file and keep the California rows / columns used below.
# Forward slashes keep the relative path portable across operating systems; the
# previous backslash-separated literal only resolved on Windows and relied on
# the invalid string escape '\D' being passed through unchanged.
path_dfc = 'DATA/DIALYSIS_FASCILITIES/DFC_FACILITY.csv'
df_dfc_raw = pd.read_csv(path_dfc)

# Keep only rows whose 'State' value is exactly 'CA'.
df_dfc_raw_CA = df_dfc_raw[df_dfc_raw['State'] == 'CA']

# Facility-level columns carried into the analysis.
dfc_columns = ['CMS Certification Number (CCN)',
               'Facility Name',
               'County/Parish',
               '# of Dialysis Stations',
               'Profit or Non-Profit',
               'Chain Owned',
               'Chain Organization']

# .copy() makes df_dfc an independent frame rather than a slice of the raw
# data, so later column assignments cannot trigger SettingWithCopyWarning.
# NOTE(review): the CCN is parsed with pandas' default dtype inference; if the
# identifiers can carry leading zeros, confirm whether they should be read as
# strings (that change would have to be made for every file joined on CCN).
df_dfc = df_dfc_raw_CA[dfc_columns].copy()


In [3]:
# Import and filter the ICH CAHPS survey data.
# Forward slashes keep the relative path portable across operating systems; the
# previous backslash-separated literal only resolved on Windows and relied on
# the invalid string escapes '\D' and '\I' being passed through unchanged.
path_chaps = 'DATA/DIALYSIS_FASCILITIES/ICH_CAHPS_FACILITY.csv'
df_chaps_raw = pd.read_csv(path_chaps)

# Filter the data to only include California
df_chaps_RAW_CA = df_chaps_raw[df_chaps_raw['State'] == 'CA']

# Join key (CCN) plus the survey volume, star-rating and linearized-score
# columns carried into the analysis.
chaps_columns = ["CMS Certification Number (CCN)",
                 "Total number of completed interviews from the Fall and Spring Surveys",
                 "Survey response rate",
                 "Star rating of the dialysis facility",
                 "ICH CAHPS Survey of patients' experiences star rating",
                 "Linearized score of nephrologists' communication and caring",
                 "Linearized score of quality of dialysis center care and operations",
                 "Linearized score of providing information to patients",
                 "Linearized score of rating of the nephrologist",
                 "Linearized score of rating of the dialysis center staff",
                 "Linearized score of rating of the dialysis facility"]

# .copy() makes df_chaps an independent frame rather than a slice of the raw data.
df_chaps = df_chaps_RAW_CA[chaps_columns].copy()

# Create a main dataframe holding only the facility CCN column.
main_df = pd.DataFrame(df_dfc['CMS Certification Number (CCN)'])


In [4]:
# Combine the facility listing with the survey results and shorten the headers.

# Long source header -> short analysis column name.
data_dict = {
    # facility attributes (from df_dfc)
    "CMS Certification Number (CCN)": "CCN",
    "Facility Name": "FAC_NAME",
    "County/Parish": "COUNTY",
    "# of Dialysis Stations": "STATIONS",
    "Profit or Non-Profit": "PROF_NP",
    "Chain Owned": "CHAIN_OWN",
    "Chain Organization": "CHAIN_ORG",
    # survey volume and star ratings (from df_chaps)
    "Total number of completed interviews from the Fall and Spring Surveys": "SURVEY_COUNT",
    "Survey response rate": "SURVEY_RATE",
    "Star rating of the dialysis facility": "FAC_STAR",
    "ICH CAHPS Survey of patients' experiences star rating": "XP_STAR",
    # linearized survey scores (from df_chaps)
    "Linearized score of nephrologists' communication and caring": "COMM_SCR",
    "Linearized score of quality of dialysis center care and operations": "QUALITY_SCR",
    "Linearized score of providing information to patients": "INFO_SCR",
    "Linearized score of rating of the nephrologist": "PHYS_SCR",
    "Linearized score of rating of the dialysis center staff": "STAFF_SCR",
    "Linearized score of rating of the dialysis facility": "FAC_SCR",
}

# Left join on the CCN: every facility row in df_dfc is kept, and facilities
# with no matching survey row get NaN in the survey columns.
df_fasc = (
    df_dfc
    .merge(df_chaps, how='left', on='CMS Certification Number (CCN)')
    .rename(columns=data_dict)
)

df_fasc.head()

Unnamed: 0,CCN,FAC_NAME,COUNTY,STATIONS,PROF_NP,CHAIN_OWN,CHAIN_ORG,SURVEY_COUNT,SURVEY_RATE,FAC_STAR,XP_STAR,COMM_SCR,QUALITY_SCR,INFO_SCR,PHYS_SCR,STAFF_SCR,FAC_SCR
0,52305,SANTA CLARA VALLEY RENAL CARE CENTER,Santa Clara,25,Non-profit,No,Independent,47.0,19.0,3.0,4.0,85.0,85.0,81.0,88.0,88.0,89.0
1,52311,St. Joseph Hospital Renal Center,Orange,39,Non-profit,No,Independent,,,,,,,,,,
2,52321,Childrens Hospital of Los Angeles,Los Angeles,10,Non-profit,No,Independent,,,,,,,,,,
3,52323,Kaiser Foundation Hospital Medical Ctr.- Sunset,Los Angeles,30,Non-profit,Yes,Kaiser Permanente,52.0,25.0,4.0,4.0,76.0,82.0,84.0,82.0,88.0,91.0
4,52334,Arrowhead Regional Medical Center,San Bernardino,8,Non-profit,No,Independent,,,,,,,,,,


In [None]:
# Print the column dtypes and non-null counts of the merged facility frame.
df_fasc.info()

In [None]:
# Summary statistics for numerical columns.
# Transposed (.T) so each column of df_fasc becomes one row of the summary.
df_fasc.describe().T



In [None]:

# Value counts for categorical columns:
# number of facility rows per CHAIN_ORG value, most frequent first.
df_fasc['CHAIN_ORG'].value_counts()


In [None]:
# Count missing values in each column
missing_counts = df_fasc.isnull().sum()
print(missing_counts)

# Visualize the missing-value pattern: the heatmap is drawn from the boolean
# df_fasc.isnull() mask, so it has one row per df_fasc row and one column per
# df_fasc column. `plt` and `sns` come from the notebook's top import cell.
# An explicit Axes is used so the title/labels attach to this figure rather
# than to whatever pyplot considers "current".
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_fasc.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing values in df_fasc')
ax.set(xlabel='Column', ylabel='Row index');


In [None]:
# Show the column labels of df_fasc (the short names assigned by the rename step).
df_fasc.columns

In [None]:
# Pairwise relationships between the numeric facility/survey columns,
# colored by the CHAIN_OWN value of each facility.
# `sns` comes from the notebook's top import cell; the redundant in-cell
# import was removed.
columns = ['STATIONS', 'SURVEY_COUNT', 'SURVEY_RATE', 'FAC_STAR', 'XP_STAR', 'COMM_SCR', 'QUALITY_SCR', 'INFO_SCR', 'PHYS_SCR', 'STAFF_SCR', 'FAC_SCR']
# Create the pairplot with the filtered columns
sns.pairplot(data=df_fasc, vars=columns, hue='CHAIN_OWN')


In [None]:
# Pairwise scatter grid over the numeric facility/survey columns,
# colored by the PROF_NP value of each facility.
columns = [
    'STATIONS',
    'SURVEY_COUNT',
    'SURVEY_RATE',
    'FAC_STAR',
    'XP_STAR',
    'COMM_SCR',
    'QUALITY_SCR',
    'INFO_SCR',
    'PHYS_SCR',
    'STAFF_SCR',
    'FAC_SCR',
]
sns.pairplot(data=df_fasc, hue='PROF_NP', vars=columns)


In [None]:
# aggregate by county
# Reducer applied to each column within a county: facility rows are counted
# via CCN, STATIONS and SURVEY_COUNT are summed, everything else is averaged.
agg_funcs = {'CCN': 'count',
             'STATIONS': 'sum',
             'PROF_NP': 'mean',
             'CHAIN_OWN': 'mean',
             'SURVEY_COUNT': 'sum',
             'SURVEY_RATE': 'mean',
             'FAC_STAR': 'mean',
             'XP_STAR': 'mean',
             'COMM_SCR': 'mean',
             'QUALITY_SCR': 'mean',
             'INFO_SCR': 'mean',
             'PHYS_SCR': 'mean',
             'STAFF_SCR': 'mean',
             'FAC_SCR': 'mean'}

# Not referenced in this cell; kept because later cells may rely on it.
agg_col = ['COUNTY', 'STATIONS', 'PROF_NP', 'CHAIN_OWN', 'SURVEY_COUNT',
           'SURVEY_RATE', 'FAC_STAR', 'XP_STAR', 'COMM_SCR', 'QUALITY_SCR',
           'INFO_SCR', 'PHYS_SCR', 'STAFF_SCR', 'FAC_SCR']

# Work on a copy so df_fasc keeps its original text labels.
df_agg = df_fasc.copy()

# Encode the ownership flags as 1/0 so their county mean is a share.
# The data spells the label 'Non-profit' (lower-case "p", see df_fasc.head()),
# so the previous 'Non-Profit' key never matched and every non-profit facility
# became NaN. Labels are normalized (trimmed, lower-cased) before mapping so
# the encoding does not depend on capitalization.
df_agg['PROF_NP'] = (
    df_agg['PROF_NP']
    .str.strip()
    .str.lower()
    .map({'profit': 1, 'non-profit': 0})
)
df_agg['CHAIN_OWN'] = df_agg['CHAIN_OWN'].map({'Yes': 1, 'No': 0})

# One row per county; the CCN count is renamed to say what it measures.
df_agg = (
    df_agg
    .groupby('COUNTY')
    .agg(agg_funcs)
    .rename(columns={'CCN': 'FAC_COUNT'})
)
df_agg.head()

In [None]:
# Print the column dtypes and non-null counts of the county-level aggregate.
df_agg.info()

In [None]:

# Pairwise plots across the columns of the county-level aggregate
# (no `vars` or `hue` is passed, so seaborn's defaults choose the columns).
sns.pairplot(data=df_agg)