In [0]:
# Load the bronze-layer encounter extracts, one DataFrame per hospital.
df_hosa = spark.read.format("parquet").load("/mnt/bronze/hosa/encounters")  # Hospital A
df_hosb = spark.read.format("parquet").load("/mnt/bronze/hosb/encounters")  # Hospital B

# Stack both hospitals into a single DataFrame; columns are aligned by name,
# not by position.
df_merged = df_hosa.unionByName(df_hosb)
display(df_merged)

# Expose the merged data to the SQL cells under the view name "encounters".
df_merged.createOrReplaceTempView("encounters")

In [0]:
%sql
-- quality_checks: projection of the merged `encounters` view used as the merge source.
--   * EncounterID     -> "<source EncounterID>_<datasource>"; the raw id is kept
--                        as SRC_EncounterID
--   * InsertedDate / ModifiedDate are exposed with an SRC_ prefix
--   * is_quarantined  -> TRUE when EncounterID or PatientID is missing
CREATE OR REPLACE TEMPORARY VIEW quality_checks AS
SELECT
    EncounterID || '_' || datasource            AS EncounterID,
    EncounterID                                 AS SRC_EncounterID,
    PatientID,
    EncounterDate,
    EncounterType,
    ProviderID,
    DepartmentID,
    ProcedureCode,
    InsertedDate                                AS SRC_InsertedDate,
    ModifiedDate                                AS SRC_ModifiedDate,
    datasource,
    -- IS NULL never yields NULL, so this predicate is always strictly TRUE or FALSE
    (EncounterID IS NULL OR PatientID IS NULL)  AS is_quarantined
FROM encounters

In [0]:
%sql
-- Preview the rows produced by the quality_checks view
SELECT *
FROM quality_checks

In [0]:
%sql
-- Silver-layer encounters table (one row per version of an encounter).
-- Business columns mirror the quality_checks view; the last three columns are
-- the versioning/audit fields maintained by the MERGE that loads this table.
-- The format is declared explicitly because MERGE INTO requires a Delta table;
-- without USING DELTA the format depends on the workspace/runtime default.
CREATE TABLE IF NOT EXISTS silver.encounters(
  EncounterID STRING,          -- "<source EncounterID>_<datasource>"
  SRC_EncounterID STRING,      -- EncounterID as delivered by the source system
  PatientID STRING,
  EncounterDate DATE,
  EncounterType STRING,
  ProviderID STRING,
  DepartmentID STRING,
  ProcedureCode STRING,
  SRC_InsertedDate TIMESTAMP,  -- InsertedDate from the source
  SRC_ModifiedDate TIMESTAMP,  -- ModifiedDate from the source
  datasource STRING,
  is_quarantined BOOLEAN,      -- TRUE when EncounterID or PatientID was missing
  inserted_date TIMESTAMP,     -- set when the row is inserted into this table
  modified_date TIMESTAMP,     -- set on insert and again when the row is expired
  is_current BOOLEAN           -- TRUE for the active version of an encounter
)
USING DELTA

In [0]:
%sql
-- SCD Type 2 load of quality_checks into silver.encounters.
--   ON            : pair each source row with the *current* target version that
--                   has the same EncounterID.
--   WHEN MATCHED  : if any tracked column differs, expire the current version
--                   (is_current = false, modified_date = now).
--   NOT MATCHED   : insert the source row as a new current version.
--
-- Change detection uses the null-safe equality operator (<=>). With a plain
-- `!=`, comparing a value against NULL evaluates to NULL instead of TRUE, so a
-- column changing to or from NULL would never expire the current row.
--
-- NOTE(review): a changed row is only expired in this pass; its replacement is
-- inserted by a later merge run, once no current version matches. Confirm this
-- two-pass behaviour is intended.
-- NOTE(review): source rows whose EncounterID is NULL never satisfy the ON
-- equality, so they take the insert branch on every run.
MERGE INTO silver.encounters AS target
USING quality_checks AS source
ON target.EncounterID = source.EncounterID
AND target.is_current = true
WHEN MATCHED AND(
  NOT (target.SRC_EncounterID <=> source.SRC_EncounterID) OR
  NOT (target.PatientID <=> source.PatientID) OR
  NOT (target.EncounterDate <=> source.EncounterDate) OR
  NOT (target.EncounterType <=> source.EncounterType) OR
  NOT (target.ProviderID <=> source.ProviderID) OR
  NOT (target.DepartmentID <=> source.DepartmentID) OR
  NOT (target.ProcedureCode <=> source.ProcedureCode) OR
  NOT (target.SRC_InsertedDate <=> source.SRC_InsertedDate) OR
  NOT (target.SRC_ModifiedDate <=> source.SRC_ModifiedDate) OR
  NOT (target.datasource <=> source.datasource) OR
  NOT (target.is_quarantined <=> source.is_quarantined)
) THEN UPDATE SET
  target.is_current = false,
  target.modified_date = current_timestamp()

WHEN NOT MATCHED THEN
INSERT(
  EncounterID,
  SRC_EncounterID,
  PatientID,
  EncounterDate,
  EncounterType,
  ProviderID,
  DepartmentID,
  ProcedureCode,
  SRC_InsertedDate,
  SRC_ModifiedDate,
  datasource,
  is_quarantined,
  inserted_date,
  modified_date,
  is_current
) 
VALUES(
  source.EncounterID,
  source.SRC_EncounterID,
  source.PatientID,
  source.EncounterDate,
  source.EncounterType,
  source.ProviderID,
  source.DepartmentID,
  source.ProcedureCode,
  source.SRC_InsertedDate,
  source.SRC_ModifiedDate,
  source.datasource,
  source.is_quarantined,
  current_timestamp(),  -- inserted_date
  current_timestamp(),  -- modified_date
  true                  -- is_current
);