In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/evansavo@gmail.com/Population_Health_&_Readmission_Risk/1.setup/utilities

In [0]:
# Declare the notebook's input widgets (text boxes) and their default values
dbutils.widgets.text(name='catalog', defaultValue='phr', label='Catalog')
dbutils.widgets.text(name='data_source', defaultValue='', label='Data Source')

In [0]:
# Resolve the notebook parameters from the widgets; surrounding whitespace is
# trimmed so a stray space typed into a widget cannot end up in a table name.
catalog = dbutils.widgets.get('catalog').strip()
data_source = dbutils.widgets.get('data_source').strip()

# The 'data_source' widget defaults to '', which would otherwise build a table
# name ending in '.' and only fail later when the table is read. Fail fast here.
if not catalog:
    raise ValueError("The 'catalog' widget must not be empty.")
if not data_source:
    raise ValueError("The 'data_source' widget must be set to the name of the source table.")

# Fully qualified name of the source table to ingest
base_path = f'databricks_cms_synthetic_public_use_files_synpuf.cms_synpuf_ext.{data_source}'

print(base_path)

# Bronze Level

In [0]:
# Read the source table and tag every row with the time it was read
df = (
    spark.table(base_path)
    .withColumn('read_timestamp', F.current_timestamp())
)

# Preview a couple of rows
display(df.limit(2))

In [0]:
# Print the column names and data types of the DataFrame that was just loaded
df.printSchema()

In [0]:
# Persist the loaded data as the bronze Delta table, replacing any existing
# contents. The 'delta.enableChangeDataFeed' option is passed as 'true'.
# NOTE: bronze_schema is not defined in this notebook; presumably it comes from
# the %run utilities notebook -- confirm.
(
    df.write
    .mode('overwrite')
    .format('delta')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f'{catalog}.{bronze_schema}.{data_source}')
)

# Silver Level

In [0]:
# Read the bronze table back as the starting point for the silver transforms.
# Uses the table reader (as the bronze load does) rather than interpolating the
# widget-supplied names into a SQL string.
bronze_df = spark.read.table(f'{catalog}.{bronze_schema}.{data_source}')

# Preview a few rows
display(bronze_df.limit(5))

In [0]:
# Fix Numeric Datatype (Amounts): cast each amount column to double in place
amount_columns = [
    'MEDREIMB_IP', 'BENRES_IP', 'PPPYMT_IP',
    'MEDREIMB_OP', 'BENRES_OP', 'PPPYMT_OP',
    'MEDREIMB_CAR', 'BENRES_CAR', 'PPPYMT_CAR',
]

# Start from the bronze data and replace one column per iteration
silver_df = bronze_df
for amount_column in amount_columns:
    silver_df = silver_df.withColumn(amount_column, F.col(amount_column).cast('double'))

In [0]:
# Fix Numeric Datatype (Months): cast each month-count column to int in place
month_columns = [
    'BENE_HI_CVRAGE_TOT_MONS',
    'BENE_SMI_CVRAGE_TOT_MONS',
    'BENE_HMO_CVRAGE_TOT_MONS',
    'PLAN_CVRG_MOS_NUM',
]

for month_column in month_columns:
    silver_df = silver_df.withColumn(month_column, F.col(month_column).cast('int'))

In [0]:
# Clean BENE_BIRTH_DT and BENE_DEATH_DT: parse both with the 'yyyyMMdd' pattern
for date_column in ('BENE_BIRTH_DT', 'BENE_DEATH_DT'):
    silver_df = silver_df.withColumn(
        date_column,
        F.to_date(F.col(date_column), 'yyyyMMdd'),
    )

In [0]:
# Create Age column
# AGE is the number of days from BENE_BIRTH_DT to BENE_DEATH_DT, divided by
# 365.25 (so it is a fractional number of years). When BENE_DEATH_DT is null,
# the reference date 2010-12-31 is used in its place.
# The reference date is cast to a real date so that coalesce() combines two
# date-typed expressions instead of relying on implicit string/date coercion.
ref_date = F.lit('2010-12-31').cast('date')
silver_df = silver_df.withColumn(
    'AGE',
    F.datediff(
        F.coalesce(F.col('BENE_DEATH_DT'), ref_date),
        F.col('BENE_BIRTH_DT'),
    ) / 365.25,
)

In [0]:
# Remove rows that are exact duplicates across every column
silver_df = silver_df.distinct()

In [0]:
# Rearrange columns: the silver table keeps exactly these columns, in this order
silver_columns = [
    'DESYNPUF_ID',
    'BENE_BIRTH_DT', 'BENE_DEATH_DT', 'AGE',
    'BENE_SEX_IDENT_CD', 'BENE_RACE_CD', 'BENE_ESRD_IND',
    'SP_STATE_CODE', 'BENE_COUNTY_CD',
    'BENE_HI_CVRAGE_TOT_MONS', 'BENE_SMI_CVRAGE_TOT_MONS',
    'BENE_HMO_CVRAGE_TOT_MONS', 'PLAN_CVRG_MOS_NUM',
    'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
    'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS',
    'SP_RA_OA', 'SP_STRKETIA',
    'MEDREIMB_IP', 'BENRES_IP', 'PPPYMT_IP',
    'MEDREIMB_OP', 'BENRES_OP', 'PPPYMT_OP',
    'MEDREIMB_CAR', 'BENRES_CAR', 'PPPYMT_CAR',
    'read_timestamp',
]

silver_df = silver_df.select(*silver_columns)

In [0]:
# Persist the transformed data as the silver Delta table, replacing any existing
# contents. The 'delta.enableChangeDataFeed' option is passed as 'true'.
# NOTE: silver_schema is not defined in this notebook; presumably it comes from
# the %run utilities notebook -- confirm.
(
    silver_df.write
    .option('delta.enableChangeDataFeed', 'true')
    .mode('overwrite')
    .format('delta')
    .saveAsTable(f'{catalog}.{silver_schema}.{data_source}')
)