In [0]:
# Echo the `spark` object so the notebook shows it as this cell's output.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably the
# hosting environment pre-creates it — confirm, otherwise this cell raises NameError.
spark

In [0]:
# Performing necessary imports required for the project
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Defines the explicit StructType schema for each dataset
def define_schemas():
    """Build the explicit Spark schemas used when reading the four CSV datasets.

    Each schema is described as an ordered list of ``(column name, data type)``
    pairs and converted into a ``StructType``; every field is declared nullable.

    Returns:
        tuple: ``(patients_schema, encounters_schema, conditions_schema,
        immunizations_schema)`` -- four ``StructType`` objects, in that order.
    """

    def _nullable_struct(columns):
        # One nullable StructField per (name, type) pair, in the order given.
        return StructType([StructField(name, dtype, True) for name, dtype in columns])

    # Encounters dataset
    encounters_schema = _nullable_struct([
        ("id", StringType()),
        ("start", TimestampType()),
        ("stop", TimestampType()),
        ("patient", StringType()),
        ("organization", StringType()),
        ("provider", StringType()),
        ("payer", StringType()),
        ("encounterclass", StringType()),
        ("code", IntegerType()),
        ("description", StringType()),
        ("base_encounter_cost", DoubleType()),
        ("total_claim_cost", DoubleType()),
        ("payer_coverage", DoubleType()),
        ("reasoncode", StringType()),
    ])

    # Patients dataset
    # NOTE(review): "fips" and "zip" are read as integers, which cannot keep
    # leading zeros -- confirm that is acceptable for the source data.
    patients_schema = _nullable_struct([
        ("id", StringType()),
        ("birthdate", DateType()),
        ("deathdate", DateType()),
        ("ssn", StringType()),
        ("drivers", StringType()),
        ("passport", StringType()),
        ("prefix", StringType()),
        ("first", StringType()),
        ("last", StringType()),
        ("suffix", StringType()),
        ("maiden", StringType()),
        ("marital", StringType()),
        ("race", StringType()),
        ("ethnicity", StringType()),
        ("gender", StringType()),
        ("birthplace", StringType()),
        ("address", StringType()),
        ("city", StringType()),
        ("state", StringType()),
        ("county", StringType()),
        ("fips", IntegerType()),
        ("zip", IntegerType()),
        ("lat", DoubleType()),
        ("lon", DoubleType()),
        ("healthcare_expenses", DoubleType()),
        ("healthcare_coverage", DoubleType()),
        ("income", IntegerType()),
        ("mrn", IntegerType()),
    ])

    # Conditions dataset (note: "code" is a string here, unlike the other datasets)
    conditions_schema = _nullable_struct([
        ("start", TimestampType()),
        ("stop", TimestampType()),
        ("patient", StringType()),
        ("encounter", StringType()),
        ("code", StringType()),
        ("description", StringType()),
    ])

    # Immunizations dataset
    immunizations_schema = _nullable_struct([
        ("date", TimestampType()),
        ("patient", StringType()),
        ("encounter", StringType()),
        ("code", IntegerType()),
        ("description", StringType()),
    ])

    return patients_schema, encounters_schema, conditions_schema, immunizations_schema






### Data Extraction

In [0]:
def extract_data(spark, patients_schema, encounters_schema, conditions_schema, immunizations_schema,
                 base_path="s3://health-care-data-bucket"):
    """Read the four healthcare CSV files into DataFrames using explicit schemas.

    Args:
        spark: Session object whose ``read.csv`` is used to load each file.
        patients_schema: Schema applied to ``patients.csv``.
        encounters_schema: Schema applied to ``encounters.csv``.
        conditions_schema: Schema applied to ``conditions.csv``.
        immunizations_schema: Schema applied to ``immunizations.csv``.
        base_path: Directory or bucket prefix that holds the four files. Defaults
            to the bucket this notebook originally hard-coded, so existing calls
            behave exactly as before; a trailing ``/`` is tolerated.

    Returns:
        tuple: ``(patients_df, encounters_df, conditions_df, immunizations_df)``.
    """
    # Normalise the prefix once so "prefix" and "prefix/" give the same paths.
    root = base_path.rstrip("/")

    def _read_csv(file_name, schema):
        # Every file has a header row and is read with a caller-supplied schema.
        return spark.read.csv(f"{root}/{file_name}", schema=schema, header=True)

    encounters_df = _read_csv("encounters.csv", encounters_schema)
    patients_df = _read_csv("patients.csv", patients_schema)
    conditions_df = _read_csv("conditions.csv", conditions_schema)
    immunizations_df = _read_csv("immunizations.csv", immunizations_schema)
    return patients_df, encounters_df, conditions_df, immunizations_df

### Data Cleaning and Transformation

In [0]:
def transform_data(patients_df, encounters_df, conditions_df, immunizations_df):
    """Clean and reshape the raw healthcare DataFrames.

    Patients:
        * ``gender``: ``M`` -> ``Male``, ``F`` -> ``Female``, any other value -> ``Unknown``.
        * ``marital``: ``M`` -> ``Married``, ``S`` -> ``Single``, ``D`` -> ``Divorced``,
          any other value -> ``Unknown``.
        * Renames ``id``/``first``/``last``/``lat``/``lon`` to
          ``patient_id``/``first_name``/``last_name``/``latitude``/``longitude``.

    Immunizations:
        * Casts ``date`` to a date and renames ``patient``/``encounter``/``code`` to
          ``vaccined_patient_id``/``encounter_id``/``vaccine_code``.

    Encounters:
        * Splits ``start``/``stop`` into a date part (``start_date``/``stop_date``)
          and an ``HH:mm:ss`` time part (``start_time``/``end_time``).
        * Renames ``id``/``patient``/``code`` to
          ``encounter_id``/``patient_id``/``encounter_code``.

    Conditions are currently passed through without any transformation.

    Args:
        patients_df: Raw patients DataFrame.
        encounters_df: Raw encounters DataFrame.
        conditions_df: Raw conditions DataFrame.
        immunizations_df: Raw immunizations DataFrame.

    Returns:
        tuple: ``(clean_patients_df, clean_encounters_df, conditions_df,
        clean_immunization_df)``. Previously nothing was returned, so the
        transformations built here were discarded.
    """
    # --- Patients -----------------------------------------------------------
    gender_label = (
        when(col("gender") == "M", "Male")
        .when(col("gender") == "F", "Female")
        .otherwise("Unknown")
    )
    marital_label = (
        when(col("marital") == "M", "Married")
        .when(col("marital") == "S", "Single")
        .when(col("marital") == "D", "Divorced")
        .otherwise("Unknown")
    )
    clean_patients_df = (
        patients_df
        .withColumn("gender", gender_label)
        .withColumn("marital", marital_label)
        .withColumnRenamed("id", "patient_id")
        .withColumnRenamed("first", "first_name")
        .withColumnRenamed("last", "last_name")
        .withColumnRenamed("lat", "latitude")
        # Fixed spelling: the column used to be renamed to "longitutde".
        .withColumnRenamed("lon", "longitude")
    )

    # --- Immunizations ------------------------------------------------------
    clean_immunization_df = (
        immunizations_df
        .withColumn("date", immunizations_df["date"].cast("date"))
        .withColumnRenamed("patient", "vaccined_patient_id")
        .withColumnRenamed("encounter", "encounter_id")
        .withColumnRenamed("code", "vaccine_code")
    )

    # --- Encounters ---------------------------------------------------------
    # Stored under its own name; it used to overwrite clean_immunization_df.
    # The time-of-day columns are derived before start/stop are cast to dates,
    # because the cast drops the time component.
    clean_encounters_df = (
        encounters_df
        .withColumn("start_time", date_format("start", "HH:mm:ss"))
        .withColumn("start", encounters_df["start"].cast("date"))
        .withColumn("end_time", date_format("stop", "HH:mm:ss"))
        .withColumn("stop", encounters_df["stop"].cast("date"))
        .withColumnRenamed("id", "encounter_id")
        .withColumnRenamed("start", "start_date")
        .withColumnRenamed("stop", "stop_date")
        .withColumnRenamed("patient", "patient_id")
        .withColumnRenamed("code", "encounter_code")
    )

    return clean_patients_df, clean_encounters_df, conditions_df, clean_immunization_df
    

### Code Execution

In [0]:

def execute():
    """Run the ETL steps in order: build schemas, read the CSV files, transform.

    Returns:
        Whatever ``transform_data`` returns, handed back to the caller instead of
        being dropped so the result can be inspected or persisted afterwards.
    """
    # Local name differs from the notebook-level `spark` to avoid shadowing it.
    session = SparkSession.builder.appName("Healthcare_ETL").getOrCreate()
    patients_schema, encounters_schema, conditions_schema, immunizations_schema = define_schemas()
    patients_df, encounters_df, conditions_df, immunizations_df = extract_data(
        session, patients_schema, encounters_schema, conditions_schema, immunizations_schema
    )
    return transform_data(patients_df, encounters_df, conditions_df, immunizations_df)


# Keep the pipeline result available to later cells rather than discarding it.
etl_result = execute()

encounter_id,start_date,stop_date,patient_id,organization,provider,payer,encounterclass,encounter_code,description,base_encounter_cost,total_claim_cost,payer_coverage,reasoncode,start_time,end_time
be86bb53-1982-c56d-ee22-ac961787aa0c,2018-04-07,2018-04-07,bb8d3c0d-78f6-747e-bd03-9de9efd98a21,9d0e702d-50a0-3f4c-9126-0951d560fd4b,179a5ef5-b06b-39c2-82f8-b552b709eb3c,8fa6c185-e44e-3e34-8bd8-39be8694f4ce,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,21:17:11,21:32:11
a185944c-70c1-3fdf-073a-9ba86d29606d,2019-10-25,2019-10-25,bb8d3c0d-78f6-747e-bd03-9de9efd98a21,9d0e702d-50a0-3f4c-9126-0951d560fd4b,179a5ef5-b06b-39c2-82f8-b552b709eb3c,8fa6c185-e44e-3e34-8bd8-39be8694f4ce,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,12:17:11,12:32:11
67deb48d-6bc0-8142-189d-1f10abc7c6bc,2018-12-13,2018-12-13,a3a96fd1-3638-41d3-72dc-efc248f2b887,217cb6f6-e822-3831-9d9d-ffa104971042,7077be2a-5b48-35f1-98b8-5e5b5a42343b,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,23:11:54,23:26:54
de112f81-cfa9-ab77-8c56-e7ee074a0abe,2022-10-09,2022-10-09,b1a0a29e-113d-903c-6cef-016235be98e8,ae3eab22-8868-37bb-9a59-2b8bfe14bf34,1e9fb93b-b6e1-3e44-b1d6-cb0c323bee95,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,01:53:22,02:08:22
ea6202ee-7152-3e80-d779-5a53fe351f19,2020-05-09,2020-05-09,bd603d4c-3093-2104-e5f0-360cb08b7536,901c2d40-1ca3-3879-9a20-c663b8adc0a9,ec66f0b4-c703-33ad-ac54-5c1480a450de,b046940f-1664-3047-bca7-dfa76be352a4,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,21.71,,13:34:31,13:49:31
1ace1153-4699-5c0c-4f3a-58cb3c2ec648,2022-03-05,2022-03-05,2afea9cf-f03f-7408-0535-d640b003c339,5018c664-e283-30eb-932a-529d9a19b3b5,0204406f-f2dd-35c4-8945-a4d788d4a287,b046940f-1664-3047-bca7-dfa76be352a4,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,08:43:58,08:58:58
74c235c9-59c0-94b7-ec80-108da7a1165f,2017-08-31,2017-08-31,0fab3069-e6e1-33a8-c21c-580c6cc989f4,b6eeaaf7-1683-3bcb-b6ee-81ce304636ef,9deecdc7-972f-378a-8659-6981b6cd3bd4,e03e23c9-4df1-3eb6-a62d-f70f02301496,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,17:23:50,17:38:50
8f45763c-ea2d-e462-7f94-5b0a95247b8e,2022-05-12,2022-05-12,4361f740-2bce-01eb-00d1-5b3344ae7464,0fedae9f-701f-3317-9b2f-69aea2202cdc,abdf12f9-2a02-3ca4-8b36-673a675a6771,b046940f-1664-3047-bca7-dfa76be352a4,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,08:13:56,08:28:56
7eedb204-e714-7944-8fe4-35bcc593daf1,2016-12-18,2016-12-18,dc5fe737-c79e-66d7-b834-cec1ae473dab,20df65a4-7567-3066-b680-0f71b0c31d38,13bd9bb2-e784-35c1-8a3d-cb10f8488571,0133f751-9229-3cfd-815f-b6d4979bdd6a,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,85.55,,06:40:37,06:55:37
0f640694-f5f5-ef8e-a582-c2b2ceaf8e6a,2019-07-28,2019-07-28,a9562614-9c3a-6246-a12e-10cff583a743,a537b406-fdfa-36b4-84da-12512e7e6c63,e714484d-1a16-3a8c-98fe-842ff9655cd5,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,ambulatory,1032,Hospital Encounter with Problem,85.55,85.55,0.0,,22:24:47,22:39:47
