In [0]:
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from datetime import datetime

# Obtain the active Spark session (or create one) under this job's app name.
spark = (
    SparkSession.builder
    .appName("Bronze_corr_ncm_cnae")
    .getOrCreate()
)

In [0]:
# Storage account name, read from the "acelera-grupo-5-kv" secret scope so it
# is never hard-coded in the notebook.
TGT_STORAGE_ACCOUNT = dbutils.secrets.get(scope="acelera-grupo-5-kv", key="tgt-storage-account")

# Landing container that holds the source spreadsheets.
CONTAINER = "landing-corr-ncm-cnae"

# ABFSS URI of the landing container root (note the trailing slash).
BASE_PATH = f"abfss://{CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/"

# Source (landing): files are listed directly at the container root.
SOURCE_PATH = BASE_PATH

# Bronze destination: Delta table folder inside the "bronze" container.
BRONZE_PATH = f"abfss://bronze@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/corr_ncm_cnae"

# Checkpoint location: same "bronze" container, separate "_checkpoints" folder.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
CHECKPOINT_PATH = f"abfss://bronze@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/_checkpoints/corr_ncm_cnae"

In [0]:
import pandas as pd
import pyspark.sql.functions as F
from io import BytesIO

# Names assigned positionally to the data columns of every workbook.
EXPECTED_COLUMNS = ["NCM", "NCM_DESC", "CNAE"]

# Explicit all-string schema: avoids Spark schema inference, which fails on an
# empty sheet and is fragile when a string column also contains missing values.
LANDING_SCHEMA = "NCM string, NCM_DESC string, CNAE string"


def _read_excel_from_landing(path):
    """Load one landing workbook into a pandas DataFrame of strings.

    The file is fetched through Spark's ``binaryFile`` reader and parsed on the
    driver with pandas. The first row is skipped, ``Unnamed`` columns are
    dropped, and the remaining columns are renamed to ``EXPECTED_COLUMNS``.

    Args:
        path: Full path of an ``.xls`` or ``.xlsx`` file.

    Returns:
        pandas.DataFrame with columns ``EXPECTED_COLUMNS``; missing cells are
        ``None`` so they become SQL NULLs in Spark.

    Raises:
        FileNotFoundError: If Spark returns no row for ``path``.
        ValueError: If the workbook does not have exactly the expected number
            of data columns.
    """
    # Legacy .xls needs xlrd; everything else matched by the listing is .xlsx.
    engine = "xlrd" if path.lower().endswith(".xls") else "openpyxl"

    row = (
        spark.read
        .format("binaryFile")
        .load(path)
        .select("content")
        .first()
    )
    if row is None:
        raise FileNotFoundError(f"No binary content could be read from {path}")

    pdf = pd.read_excel(
        BytesIO(row["content"]),
        skiprows=1,
        dtype=str,
        engine=engine,
    )

    # Cast labels to str first so a non-string header cannot break `.str`.
    unnamed = pdf.columns.astype(str).str.contains("Unnamed")
    pdf = pdf.loc[:, ~unnamed]

    if len(pdf.columns) != len(EXPECTED_COLUMNS):
        raise ValueError(
            f"{path}: expected {len(EXPECTED_COLUMNS)} columns "
            f"{EXPECTED_COLUMNS}, found {len(pdf.columns)}: {list(pdf.columns)}"
        )
    pdf.columns = EXPECTED_COLUMNS

    # Replace NaN with None so every cell is either a str or a null.
    return pdf.astype(object).where(pdf.notna(), None)


def _normalize_cnae(sdf, path):
    """Produce one row per CNAE code and attach ingestion metadata.

    ``CNAE`` is split on ``;``, trimmed, stripped of ``.`` and ``-``, and then
    reduced to its leading run of digits (NULL when it does not start with a
    digit). ``_ingestion_date``, ``_ingestion_timestamp`` and ``_source_path``
    are appended.

    Args:
        sdf: Spark DataFrame with at least a string ``CNAE`` column.
        path: Source file path recorded in ``_source_path``.

    Returns:
        The transformed Spark DataFrame.
    """
    cnae = F.col("CNAE")
    return (
        sdf
        # NOTE(review): explode() emits no row when CNAE is NULL, so NCM rows
        # without any CNAE are dropped here; switch to explode_outer if they
        # must be kept in bronze.
        .withColumn("CNAE", F.explode(F.split(cnae, ";")))
        .withColumn("CNAE", F.trim(cnae))
        .withColumn("CNAE", F.regexp_replace(cnae, r"[.\-]", ""))
        .withColumn(
            "CNAE",
            # No otherwise(): values not starting with a digit become NULL.
            F.when(
                cnae.rlike(r"^\d+"),
                F.regexp_extract(cnae, r"^(\d+)", 1)
            )
        )
        .withColumn("_ingestion_date", F.current_date())
        .withColumn("_ingestion_timestamp", F.current_timestamp())
        .withColumn("_source_path", F.lit(path))
    )


# Excel workbooks sitting at the root of the landing container.
files = [
    f.path for f in dbutils.fs.ls(SOURCE_PATH)
    if f.path.lower().endswith((".xls", ".xlsx"))
]

# One normalized Spark DataFrame per source file.
dfs = [
    _normalize_cnae(
        spark.createDataFrame(_read_excel_from_landing(path), schema=LANDING_SCHEMA),
        path,
    )
    for path in files
]

In [0]:
# Fail fast when nothing was read: otherwise `final_df` is never assigned and
# the write step either raises NameError or, on a warm kernel, silently
# re-appends a stale `final_df` left over from a previous run.
if not dfs:
    raise ValueError("No .xls/.xlsx files were found in the landing path; nothing to ingest.")

# Stack the per-file DataFrames, matching columns by name rather than position.
final_df = dfs[0]
for df in dfs[1:]:
    final_df = final_df.unionByName(df)

In [0]:
from delta.tables import DeltaTable

# First run creates the Delta table partitioned by ingestion date; later runs
# append to the existing table.
# NOTE(review): every run appends all files currently in landing, so re-running
# without clearing landing presumably duplicates rows — confirm this is intended.
bronze_exists = DeltaTable.isDeltaTable(spark, BRONZE_PATH)

writer = final_df.write.format("delta")
if bronze_exists:
    writer = writer.mode("append")
else:
    writer = writer.mode("overwrite").partitionBy("_ingestion_date")

writer.save(BRONZE_PATH)

In [0]:
%sql
-- Drops the bronze_balancacomercial schema together with every object in it (CASCADE).
-- NOTE(review): destructive, and this schema is not referenced by any other cell in this
-- notebook — looks like one-off cleanup; confirm it should run on every execution.
DROP SCHEMA IF EXISTS bronze_balancacomercial CASCADE;

In [0]:
# Ensure the target schema exists before registering the table.
# (Single statement: no trailing ';' is needed inside spark.sql.)
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")

# Register the Delta folder written above as a table. LOCATION reuses
# BRONZE_PATH so the table can never point somewhere other than where the
# data was saved.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS bronze.corr_ncm_cnae
USING DELTA
LOCATION '{BRONZE_PATH}'
""")

In [0]:
%sql
-- Preview the table registered by this notebook. The original query read
-- silver.corr_ncm_cnae, which this notebook neither creates nor writes.
SELECT * FROM bronze.corr_ncm_cnae