In [0]:
# Databricks notebook cell 1
from pyspark.sql import functions as F
from pyspark.sql.types import *
import json
import uuid
from datetime import datetime
dbutils.fs.mkdirs("dbfs:/FileStore/task1_input_files/")

True

In [0]:
raw_base = "dbfs:/FileStore/task1_input_files"
lake_base = "dbfs:/FileStore/fda_task1_lake"

raw_path = f"{lake_base}/raw"
curated_path = f"{lake_base}/curated"
provenance_path = f"{lake_base}/provenance"

dbutils.fs.mkdirs(raw_path)
dbutils.fs.mkdirs(curated_path)
dbutils.fs.mkdirs(provenance_path)

True

In [0]:
# Load Metadata, PHI Rules, Schemas
def load_json(path):
    return json.loads(dbutils.fs.head(path, 10000))

metadata = load_json(f"{raw_base}/metadata.json")
phi_rules = load_json(f"{raw_base}/phi_rules.json")
schema_defs = load_json(f"{raw_base}/schema_definitions.json")

metadata, phi_rules["version"], list(schema_defs.keys())

({'sources': [{'source_name': 'EHR_Clinical_Notes',
    'file': 'source1_clinical_notes.jsonl',
    'format': 'jsonl',
    'description': 'Synthetic clinical notes with PHI-like fields'},
   {'source_name': 'Lab_Reports',
    'file': 'source2_lab_reports.csv',
    'format': 'csv',
    'description': 'Synthetic lab results CSV'}]},
 'v1.0',
 ['source1_schema', 'source2_schema'])

In [0]:
# Define Spark Schemas
source1_schema = StructType([
    StructField("patient_id", StringType(), True),
    StructField("patient_name", StringType(), True),
    StructField("note_id", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("clinical_text", StringType(), True),
    StructField("doctor_name", StringType(), True),
    StructField("hospital", StringType(), True),
    StructField("mrn", StringType(), True)
])

source2_schema = StructType([
    StructField("patient_id", StringType(), True),
    StructField("report_id", StringType(), True),
    StructField("test_name", StringType(), True),
    StructField("result_value", DoubleType(), True),
    StructField("unit", StringType(), True),
    StructField("report_date", StringType(), True),
    StructField("lab_name", StringType(), True),
    StructField("technician", StringType(), True)
])

In [0]:
# Ingest JSONL and CSV
notes_df = spark.read.schema(source1_schema).json(f"{raw_base}/source1_clinical_notes.jsonl")
labs_df = spark.read.option("header",True).schema(source2_schema).csv(f"{raw_base}/source2_lab_reports.csv")

notes_df = notes_df.withColumn("timestamp_ts", F.to_timestamp("timestamp"))
labs_df = labs_df.withColumn("report_date_dt", F.to_date("report_date"))

In [0]:
# Apply PHI Redaction
patterns = phi_rules["patterns"]
redaction_format = phi_rules["redaction_format"]

def redact(col):
    expr = col
    for r in patterns:
        expr = F.regexp_replace(expr, r["pattern"], redaction_format.replace("{type}", r["type"].upper()))
    return expr

red_notes = notes_df.select(
    "patient_id","note_id","timestamp_ts",
    redact(F.col("clinical_text")).alias("clinical_text"),
    redact(F.col("doctor_name")).alias("doctor_name"),
    redact(F.col("patient_name")).alias("patient_name"),
    redact(F.col("hospital")).alias("hospital"),
    redact(F.col("mrn")).alias("mrn")
).withColumn("phi_rules_applied", F.array([F.lit(r["type"]) for r in patterns]))

red_labs = labs_df.select(
    "patient_id","report_id","report_date_dt",
    F.concat_ws(" | ", "test_name", F.col("result_value").cast("string"), "unit").alias("text"),
    redact(F.col("lab_name")).alias("hospital"),
    F.lit(None).alias("doctor_name"),
    F.array([F.lit(r["type"]) for r in patterns]).alias("phi_rules_applied")
)

In [0]:
# Build QLM-Ready Dataset
qlm_notes = red_notes.select(
    "patient_id",
    F.col("note_id").alias("source_record_id"),
    F.col("timestamp_ts").alias("event_timestamp"),
    "clinical_text",
    "doctor_name",
    "hospital",
    "phi_rules_applied"
).withColumn("source_system", F.lit("EHR_Notes")).withColumn("record_type", F.lit("clinical_note"))

qlm_labs = red_labs.select(
    "patient_id",
    F.col("report_id").alias("source_record_id"),
    "report_date_dt",
    F.col("text").alias("text"),
    "doctor_name",
    "hospital",
    "phi_rules_applied"
).withColumn("source_system", F.lit("Labs")).withColumn("record_type", F.lit("lab_report"))

qlm_df = qlm_notes.unionByName(qlm_labs, allowMissingColumns=True)

In [0]:
# Row Hash and Batch Hash
batch_id = str(uuid.uuid4())
ingested_at = datetime.utcnow().isoformat()

fields_to_hash = ["patient_id","source_record_id","event_timestamp","text","source_system","record_type"]

qlm_hashed = qlm_df.withColumn(
    "row_hash",
    F.sha2(F.concat_ws("||", *[F.coalesce(F.col(c).cast("string"), F.lit("")) for c in fields_to_hash]), 256)
).withColumn("batch_id", F.lit(batch_id))

batch_info = qlm_hashed.agg(
    F.sha2(F.concat_ws("", F.sort_array(F.collect_list("row_hash"))), 256).alias("batch_sha256"),
    F.count("*").alias("row_count")
).collect()[0]

In [0]:
# Save QLM Data & Provenance
qlm_path = f"{curated_path}/qlm_ready/batch_id={batch_id}"
prov_path = f"{provenance_path}/run_id={batch_id}"

qlm_hashed.write.mode("overwrite").parquet(qlm_path)

prov_data = [{
    "batch_id": batch_id,
    "ingested_at": ingested_at,
    "sources": ["EHR_Clinical_Notes","Lab_Reports"],
    "source_files": ["source1_clinical_notes.jsonl","source2_lab_reports.csv"],
    "phi_rules_applied": [r["type"] for r in patterns],
    "row_count": batch_info["row_count"],
    "batch_sha256": batch_info["batch_sha256"],
    "qlm_output_path": qlm_path
}]

prov_df = spark.createDataFrame(prov_data)
prov_df.write.mode("append").parquet(prov_path)

In [0]:
# Verify Integrity
loaded = spark.read.parquet(qlm_path)

recomputed = loaded.agg(
    F.sha2(F.concat_ws("", F.sort_array(F.collect_list("row_hash"))), 256)
).collect()[0][0]

print("Stored :", batch_info["batch_sha256"])
print("Recomputed:", recomputed)

assert recomputed == batch_info["batch_sha256"], "❌ Integrity check failed!"
print("✔ Integrity Verified")

Stored : 3556e64d22654aa66bbfb41aeaada2f92048d0583883e6e5b740f8611d49d7ed
Recomputed: 3556e64d22654aa66bbfb41aeaada2f92048d0583883e6e5b740f8611d49d7ed
✔ Integrity Verified


In [0]:
# Get provenance as Python dictionary
sample = prov_df.toPandas().to_dict(orient="records")[0]

# Convert any numpy arrays to python lists (json serializable)
for key, value in sample.items():
    if hasattr(value, "tolist"):
        sample[key] = value.tolist()

import json
pretty = json.dumps(sample, indent=2)

print(pretty)

# Save JSON to DBFS
dbutils.fs.put(f"{prov_path}/provenance_sample.json", pretty, True)

{
  "batch_id": "0df5c6cc-7bc7-4318-bd0f-fb79dab20e11",
  "batch_sha256": "3556e64d22654aa66bbfb41aeaada2f92048d0583883e6e5b740f8611d49d7ed",
  "ingested_at": "2025-11-14T06:45:20.987530",
  "phi_rules_applied": [
    "doctor_name",
    "patient_name",
    "phone_number",
    "mrn",
    "address"
  ],
  "qlm_output_path": "dbfs:/FileStore/fda_task1_lake/curated/qlm_ready/batch_id=0df5c6cc-7bc7-4318-bd0f-fb79dab20e11",
  "row_count": 400,
  "source_files": [
    "source1_clinical_notes.jsonl",
    "source2_lab_reports.csv"
  ],
  "sources": [
    "EHR_Clinical_Notes",
    "Lab_Reports"
  ]
}
Wrote 597 bytes.


True