In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
import uuid

In [0]:
# --- Run identifiers (no Spark interaction) ---------------------------------
# A fresh random UUID is generated every time this cell executes.
batch_id = str(uuid.uuid4())

# Absolute path of the single CSV file consumed by the read cell.
landing_path = "/Volumes/fomacao_microsoft_power_bi_profisional/pessoas/landing/processado/PESSOAS.csv"

# --- Session setup ----------------------------------------------------------
# Pin the SQL session time zone to UTC so timestamp/date expressions do not
# depend on the cluster's local time zone.
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Make this catalog and schema the defaults, so unqualified table names used
# afterwards resolve against them.
# NOTE(review): the catalog identifier is spelled the same way here and in
# landing_path; presumably it matches the real catalog name — confirm before
# "correcting" the spelling.
spark.sql("USE CATALOG `fomacao_microsoft_power_bi_profisional`")
spark.sql("USE `pessoas`")

In [0]:
# Reader options for the landing CSV, gathered in one mapping so they can be
# reviewed at a glance.
csv_read_options = {
    "header": True,        # first line of the file carries the column names
    "encoding": "UTF-8",
    "inferSchema": True,   # column types are inferred from the data
    "multiline": True,     # a quoted field may span several physical lines
    "mode": "PERMISSIVE",  # malformed records do not abort the read
}

# NOTE(review): with inferSchema enabled the resulting column types depend on
# the content of each file; confirm this is acceptable for a table that is
# appended to on later runs.
df_raw = spark.read.options(**csv_read_options).csv(landing_path)

In [0]:
# Columns that feed the row hash: every column delivered by the CSV, in file
# order, captured before any technical (underscore-prefixed) column is added.
cols_for_hash = list(df_raw.columns)

# One string expression per source column.
# - Each name is wrapped in backticks (embedded backticks doubled) so that a
#   CSV header containing a dot, e.g. "a.b", is resolved as a single top-level
#   column instead of being parsed as a struct-field path, which would fail
#   analysis.
# - NULL is replaced by "" so a NULL never makes the hash input NULL; as a
#   consequence NULL and the empty string contribute identically to the hash.
hash_inputs = [
    F.coalesce(
        F.col("`" + c.replace("`", "``") + "`").cast("string"),
        F.lit(""),
    )
    for c in cols_for_hash
]

# SHA-256 over all values joined with "§". A value that itself contains "§"
# can make two different rows produce the same joined string.
row_hash = F.sha2(F.concat_ws("§", *hash_inputs), 256)

# Add the technical columns. Order matters: _ingest_date is derived from
# _ingest_ts_utc, and _source_file from _source_path.
df_bronze = (
    df_raw
    .withColumn("_ingest_ts_utc", F.current_timestamp())
    .withColumn("_ingest_date", F.to_date(F.col("_ingest_ts_utc")))
    # Full path of the file each row was read from (file-source metadata column).
    .withColumn("_source_path", F.col("_metadata.file_path"))
    # Last path segment, i.e. everything after the final "/".
    .withColumn("_source_file", F.regexp_extract(F.col("_source_path"), r"([^/]+)$", 1))
    .withColumn("_batch_id", F.lit(batch_id))
    .withColumn("_row_hash", row_hash)
)

In [0]:
tbl = "bronze_pessoas"

# Decide first whether this is the initial load or a subsequent one.
table_already_exists = spark.catalog.tableExists(tbl)

# Both paths write Delta with mergeSchema enabled; only the save mode and the
# partitioning differ, so the common part of the writer is built once.
bronze_writer = df_bronze.write.format("delta").option("mergeSchema", "true")

if table_already_exists:
    # Later loads: add this batch's rows to the existing table.
    bronze_writer = bronze_writer.mode("append")
else:
    # Initial load: create the table, partitioned by ingestion date.
    bronze_writer = bronze_writer.mode("overwrite").partitionBy("_ingest_date")

bronze_writer.saveAsTable(tbl)

# Sanity check: row count of the whole table (not just this batch), followed
# by a five-row preview.
print(f"Bronze count: {spark.table(tbl).count()}")
spark.table(tbl).limit(5).display()