In [0]:
# MAGIC ## 1. Imports e Configuração

# COMMAND ----------

import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from datetime import datetime

# Obtain the Spark session for this notebook under a fixed application name
# (getOrCreate returns the already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("SilverBalancaComercial")
    .getOrCreate()
)

In [0]:
# MAGIC ## 2. Configuração de Storage (ABFSS - ADLS Gen2)

# COMMAND ----------

# The storage account name is read from the secret scope instead of being hard-coded.
TGT_STORAGE_ACCOUNT = dbutils.secrets.get(
    scope="acelera-grupo-5-kv",
    key="tgt-storage-account",
)

# Container that holds the Silver layer.
SILVER_CONTAINER = "silver"

# abfss:// (ADLS Gen2) locations: one root for the Silver tables and one for
# the incremental control table. The original note says Bronze uses the same protocol.
SILVER_BASE_PATH = f"abfss://{SILVER_CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/balancacomercial"
CONTROL_TABLE_PATH = f"abfss://{SILVER_CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/metadata/silver_control_balanca"

# Echo the resolved configuration so the run log shows where data will land.
print("\n".join((
    f"‚úÖ Storage Account: {TGT_STORAGE_ACCOUNT}",
    f"‚úÖ Silver Container: {SILVER_CONTAINER}",
    f"‚úÖ Protocolo: abfss:// (ADLS Gen2)",
    f"üìÇ Silver Base Path: {SILVER_BASE_PATH}",
    f"üìÇ Control Table Path: {CONTROL_TABLE_PATH}",
    f"üóÑÔ∏è Bronze: Leitura via Hive Metastore (bronze_balancacomercial)",
)))

# COMMAND ----------

# Fail fast: list the root of the Silver container so a missing container or
# missing permission surfaces here instead of in the middle of the load.
print("\nüß™ Validando acesso ao container silver...")

try:
    test_path = f"abfss://{SILVER_CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/"
    files = dbutils.fs.ls(test_path)
    print(f"‚úÖ Container 'silver' acess√≠vel - {len(files)} itens encontrados")
except Exception as e:
    # Print a troubleshooting checklist plus a truncated error, then re-raise
    # so the notebook run stops.
    print("\n".join((
        f"‚ùå ERRO: Cluster n√£o tem acesso ao container 'silver'",
        f"\n‚ö†Ô∏è Verifique:",
        f"   1. Container 'silver' existe no Storage Account",
        f"   2. Service Principal/Managed Identity tem permiss√µes",
        f"   3. Role necess√°ria: 'Storage Blob Data Contributor'",
        f"\nErro: {str(e)[:300]}",
    )))
    raise

In [0]:
# MAGIC ## 3. Otimizações Spark

# COMMAND ----------

# Session-level switches, all set to "true": Delta optimized writes,
# auto-compaction, automatic schema merge, and adaptive query execution
# with partition coalescing.
for _conf_key in (
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.autoCompact.enabled",
    "spark.databricks.delta.schema.autoMerge.enabled",
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(_conf_key, "true")

print("‚úÖ Otimiza√ß√µes Spark configuradas")

In [0]:
# MAGIC ## 4. Criação de Databases

# COMMAND ----------

# Make sure the two metastore databases used by this notebook exist:
# one for the Silver tables and one for the control table.
for _database in ("silver_balancacomercial", "metadata"):
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {_database}")

print("‚úÖ Databases criados no Hive Metastore")

In [0]:
# MAGIC ## 5. Criação da Tabela de Controle

# COMMAND ----------

# DDL for the incremental-control table: one watermark row per Silver table,
# stored as an external Delta table at CONTROL_TABLE_PATH.
_control_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS metadata.silver_control_balanca (
        table_name STRING COMMENT 'Nome da tabela Silver',
        last_ingestion_timestamp TIMESTAMP COMMENT '√öltimo timestamp processado',
        last_update TIMESTAMP COMMENT 'Timestamp da √∫ltima atualiza√ß√£o',
        records_processed BIGINT COMMENT 'Total de registros processados',
        execution_id STRING COMMENT 'ID da execu√ß√£o'
    )
    USING DELTA
    LOCATION '{CONTROL_TABLE_PATH}'
    COMMENT 'Controle incremental - Silver Balan√ßa Comercial'
"""
spark.sql(_control_table_ddl)

print("‚úÖ Tabela de controle criada")

# Read the physical location back from the table metadata and print it.
location = (
    spark.sql("DESCRIBE DETAIL metadata.silver_control_balanca")
    .select("location")
    .first()[0]
)
print(f"üìÇ Location: {location}")

In [0]:
# MAGIC ## 6. Configuração de Transformações

# COMMAND ----------

# Bronze -> Silver transformation catalogue.
#
# Every entry of `table_config` is a dict with the keys
#   bronze_table, silver_table, key_columns, column_mapping, partition_by, description
# where `column_mapping` maps a Bronze column name to
#   {"rename": <silver name>, "cast": <spark type>, "trim": <bool>}.
# The builders below only remove repetition; the resulting list is the same
# data structure as a fully spelled-out literal.

# Bronze columns whose Silver name is not simply the lower-cased Bronze name.
_FACT_RENAMES = {
    "CO_ANO": "ano",
    "CO_MES": "mes",
    "SG_UF_NCM": "sigla_uf_ncm",
    "SG_UF_MUN": "sigla_uf_mun",
    "SH4": "co_sh4",
}


def _columns(cast_type, trim, source_names, renames=None):
    """Build mapping rules for `source_names`, preserving their order."""
    overrides = renames or {}
    return {
        source: {
            "rename": overrides.get(source, source.lower()),
            "cast": cast_type,
            "trim": trim,
        }
        for source in source_names
    }


def _text(*source_names, renames=None):
    """Mapping rules for trimmed string columns."""
    return _columns("string", True, source_names, renames)


def _table(bronze_table, key_columns, column_mapping, description, partition_by=None):
    """Assemble one config entry; the Silver name is always `silver_<bronze>`."""
    return {
        "bronze_table": bronze_table,
        "silver_table": f"silver_{bronze_table}",
        "key_columns": key_columns,
        "column_mapping": column_mapping,
        "partition_by": partition_by,
        "description": description,
    }


def _trade_fact(bronze_table, description, extra_measures=()):
    """Config entry for the NCM-level fact tables (exp / imp)."""
    mapping = {
        **_columns("int", False, ("CO_ANO", "CO_MES"), _FACT_RENAMES),
        **_text(
            "CO_NCM", "CO_UNID", "CO_PAIS", "SG_UF_NCM", "CO_VIA", "CO_URF",
            renames=_FACT_RENAMES,
        ),
        **_columns(
            "bigint", False,
            ("QT_ESTAT", "KG_LIQUIDO", "VL_FOB") + tuple(extra_measures),
        ),
    }
    return _table(
        bronze_table,
        ["ano", "mes", "co_ncm", "co_pais", "sigla_uf_ncm", "co_via", "co_urf"],
        mapping,
        description,
        partition_by=["ano", "mes"],
    )


def _municipal_fact(bronze_table, description):
    """Config entry for the municipality-level fact tables (exp_mun / imp_mun)."""
    mapping = {
        **_columns("int", False, ("CO_ANO", "CO_MES"), _FACT_RENAMES),
        **_text("SH4", "CO_PAIS", "SG_UF_MUN", "CO_MUN", renames=_FACT_RENAMES),
        **_columns("bigint", False, ("KG_LIQUIDO", "VL_FOB")),
    }
    return _table(
        bronze_table,
        ["ano", "mes", "co_sh4", "co_pais", "sigla_uf_mun", "co_mun"],
        mapping,
        description,
        partition_by=["ano", "mes"],
    )


table_config = [
    # ========================================
    # SIMPLE DIMENSIONS (all columns are trimmed strings, unpartitioned)
    # ========================================
    _table(
        "pais", ["co_pais"],
        _text("CO_PAIS", "CO_PAIS_ISON3", "CO_PAIS_ISOA3", "NO_PAIS", "NO_PAIS_ING", "NO_PAIS_ESP"),
        "Dimens√£o de pa√≠ses",
    ),
    _table(
        "via", ["co_via"],
        _text("CO_VIA", "NO_VIA"),
        "Dimens√£o de vias de transporte",
    ),
    _table(
        "urf", ["co_urf"],
        _text("CO_URF", "NO_URF"),
        "Dimens√£o URF",
    ),
    _table(
        "ncm_unidade", ["co_unid"],
        _text("CO_UNID", "NO_UNID", "SG_UNID"),
        "Dimens√£o de unidades",
    ),
    _table(
        "uf", ["sg_uf"],
        _text("CO_UF", "SG_UF", "NO_UF", "NO_REGIAO"),
        "Dimens√£o de UF",
    ),
    _table(
        "uf_mun", ["co_mun_geo"],
        _text("CO_MUN_GEO", "NO_MUN", "NO_MUN_MIN", "SG_UF"),
        "Dimens√£o de munic√≠pios",
    ),
    _table(
        "isic_cuci", ["co_isic_secao", "co_cuci_grupo"],
        _text("CO_ISIC_SECAO", "NO_ISIC_SECAO", "CO_CUCI_GRUPO", "NO_CUCI_GRUPO"),
        "ISIC/CUCI",
    ),
    _table(
        "nbm", ["co_nbm"],
        _text("CO_NBM", "NO_NBM"),
        "NBM",
    ),
    _table(
        "ncm_ppe", ["co_ppe"],
        _text("CO_PPE", "NO_PPE", "NO_PPE_MIN", "NO_PPE_ING"),
        "PPE",
    ),
    _table(
        "ncm_ppi", ["co_ppi"],
        _text("CO_PPI", "NO_PPI", "NO_PPI_MIN", "NO_PPI_ING"),
        "PPI",
    ),
    _table(
        "ncm_fat_agreg", ["co_fat_agreg"],
        _text("CO_FAT_AGREG", "NO_FAT_AGREG", "NO_FAT_AGREG_GP"),
        "Fator agrega√ß√£o",
    ),
    # NOTE(review): keyed on co_nbm only. If one NBM code can map to several
    # NCM codes, deduplication and MERGE keep a single row per co_nbm -
    # confirm whether the key should be (co_nbm, co_ncm).
    _table(
        "nbm_ncm", ["co_nbm"],
        _text("CO_NBM", "CO_NCM"),
        "NBM-NCM",
    ),
    _table(
        "ncm_sh", ["co_sh6"],
        _text(
            "CO_SH6", "NO_SH6_POR", "NO_SH6_ESP", "NO_SH6_ING",
            "CO_SH4", "NO_SH4_POR", "NO_SH4_ESP", "NO_SH4_ING",
            "CO_SH2", "NO_SH2_POR", "NO_SH2_ESP", "NO_SH2_ING",
            "CO_NCM_SECROM", "NO_SEC_POR", "NO_SEC_ESP", "NO_SEC_ING",
        ),
        "Sistema Harmonizado",
    ),
    _table(
        "ncm_cgce", ["co_cgce_n3"],
        _text(
            "CO_CGCE_N3", "NO_CGCE_N3", "NO_CGCE_N3_ING", "NO_CGCE_N3_ESP",
            "CO_CGCE_N2", "NO_CGCE_N2", "NO_CGCE_N2_ING", "NO_CGCE_N2_ESP",
            "CO_CGCE_N1", "NO_CGCE_N1", "NO_CGCE_N1_ING", "NO_CGCE_N1_ESP",
        ),
        "CGCE",
    ),
    _table(
        "ncm_cuci", ["co_cuci_item"],
        _text(
            "CO_CUCI_ITEM", "NO_CUCI_ITEM",
            "CO_CUCI_SUB", "NO_CUCI_SUB",
            "CO_CUCI_GRUPO", "NO_CUCI_GRUPO",
            "CO_CUCI_DIVISAO", "NO_CUCI_DIVISAO",
            "CO_CUCI_SEC", "NO_CUCI_SEC",
        ),
        "CUCI",
    ),
    _table(
        "ncm_isic", ["co_isic_classe"],
        _text(
            "CO_ISIC_CLASSE", "NO_ISIC_CLASSE", "NO_ISIC_CLASSE_ING", "NO_ISIC_CLASSE_ESP",
            "CO_ISIC_GRUPO", "NO_ISIC_GRUPO", "NO_ISIC_GRUPO_ING", "NO_ISIC_GRUPO_ESP",
            "CO_ISIC_DIVISAO", "NO_ISIC_DIVISAO", "NO_ISIC_DIVISAO_ING", "NO_ISIC_DIVISAO_ESP",
            "CO_ISIC_SECAO", "NO_ISIC_SECAO", "NO_ISIC_SECAO_ING", "NO_ISIC_SECAO_ESP",
        ),
        "ISIC",
    ),
    _table(
        "ncm", ["co_ncm"],
        _text(
            "CO_NCM", "NO_NCM_POR", "NO_NCM_ESP", "NO_NCM_ING",
            "CO_UNID", "CO_SH6", "CO_PPE", "CO_PPI", "CO_FAT_AGREG",
            "CO_CUCI_ITEM", "CO_CGCE_N3", "CO_SIIT", "CO_ISIC_CLASSE", "CO_EXP_SUBSET",
        ),
        "NCM",
    ),
    _table(
        "pais_bloco", ["co_pais", "co_bloco"],
        _text("CO_PAIS", "CO_BLOCO", "NO_BLOCO", "NO_BLOCO_ING", "NO_BLOCO_ESP"),
        "Pa√≠s-Bloco",
    ),

    # ========================================
    # FACTS (partitioned by ano / mes)
    # ========================================
    _trade_fact("exp", "Fato Exporta√ß√£o"),
    _trade_fact("imp", "Fato Importa√ß√£o", extra_measures=("VL_FRETE", "VL_SEGURO")),
    _municipal_fact("exp_mun", "Fato Exp Municipal"),
    _municipal_fact("imp_mun", "Fato Imp Municipal"),
]

print(f"‚úÖ {len(table_config)} tabelas configuradas")

In [0]:
# MAGIC ## 7. Funções de Transformação

# COMMAND ----------

def get_last_processed_timestamp(silver_table_name: str) -> datetime:
    """
    Look up the watermark stored for a Silver table in the control table.

    Reads `metadata.silver_control_balanca`, keeps the rows of
    `silver_table_name`, and takes `last_ingestion_timestamp` from the row
    with the most recent `last_update`.

    Args:
        silver_table_name: Name of the Silver table (value of `table_name`).

    Returns:
        The stored timestamp, or None when there is no row for the table
        (first execution) or when the lookup raises.
    """
    # NOTE(review): any exception (not only a missing/empty control table) is
    # reported as "Controle vazio" and turned into None, which the caller
    # treats as a full load - confirm this fallback is intended.
    try:
        control_rows = (
            spark.table("metadata.silver_control_balanca")
            .filter(F.col("table_name") == silver_table_name)
            .select("last_ingestion_timestamp")
            .orderBy(F.col("last_update").desc())
            .limit(1)
            .collect()
        )

        if not control_rows:
            print(f"üÜï Primeira execu√ß√£o")
            return None

        timestamp = control_rows[0]["last_ingestion_timestamp"]
        print(f"üìÖ √öltimo processamento: {timestamp}")
        return timestamp

    except Exception as e:
        print(f"‚ö†Ô∏è Controle vazio: {e}")
        return None


def get_bronze_incremental_latest(bronze_table: str, pk_columns: list, last_processed_at: datetime):
    """
    Read the not-yet-processed rows of a Bronze table through the metastore.

    Args:
        bronze_table: Bronze table name without database; it is read from
            `bronze_balancacomercial`.
        pk_columns: Primary-key columns. Not used here - deduplication happens
            after the column mapping, because the key names are the Silver
            names (e.g. `ano`), not the Bronze ones (e.g. `CO_ANO`).
        last_processed_at: Watermark; only rows with `_ingest_ts` strictly
            greater are returned. A falsy value (None) selects the full table.

    Returns:
        The (non-deduplicated) DataFrame of rows to process.

    Raises:
        Whatever `spark.table` raises when the table cannot be resolved
        (re-raised after printing a message).
    """
    full_table_name = f"bronze_balancacomercial.{bronze_table}"

    try:
        df_bronze = spark.table(full_table_name)
    except Exception:
        print(f"‚ùå Tabela Bronze n√£o existe: {full_table_name}")
        raise

    # No watermark yet: full load.
    if not last_processed_at:
        print(f"üìä Carga FULL: {df_bronze.count():,}")
        return df_bronze

    # Incremental load: rows ingested after the watermark.
    df_incremental = df_bronze.filter(F.col("_ingest_ts") > F.lit(last_processed_at))
    print(f"üìä Registros incrementais: {df_incremental.count():,}")
    return df_incremental


def apply_column_mapping(df, column_mapping: dict):
    """
    Project a Bronze DataFrame onto its Silver columns.

    For each `source -> rules` entry (in dict order):
      * a source column missing from `df` is reported and skipped;
      * `rules` given as a plain string is treated as the new column name;
      * `rules` given as a dict may carry `rename` (default: source name),
        `cast` (optional Spark type) and `trim` (default False). Trim is
        applied before the cast.
    The technical columns `ingest_date`, `_ingest_ts` and `_source_path` are
    appended unchanged when they exist in `df`.

    Args:
        df: Bronze DataFrame.
        column_mapping: Mapping rules as described above.

    Returns:
        DataFrame containing only the mapped and technical columns.
    """
    available = df.columns
    projections = []

    for source_col, rules in column_mapping.items():
        if source_col not in available:
            print(f"‚ö†Ô∏è Coluna {source_col} n√£o encontrada - ignorando")
            continue

        # Shorthand form: the rule is just the target name.
        if isinstance(rules, str):
            projections.append(F.col(source_col).alias(rules))
            continue

        expr = F.col(source_col)
        if rules.get("trim", False):
            expr = F.trim(expr)

        cast_type = rules.get("cast")
        if cast_type:
            expr = expr.cast(cast_type)

        projections.append(expr.alias(rules.get("rename", source_col)))

    # Carry the ingestion metadata through to Silver.
    projections.extend(
        F.col(name)
        for name in ("ingest_date", "_ingest_ts", "_source_path")
        if name in available
    )

    return df.select(*projections)


def deduplicate_by_pk(df, pk_columns: list):
    """
    Keep one row per primary key: the one with the greatest `_ingest_ts`.

    Args:
        df: Already-mapped DataFrame (Silver column names); must contain
            `_ingest_ts`.
        pk_columns: Primary-key column names (Silver names).

    Returns:
        DataFrame with exactly one row per distinct key. The number of
        remaining rows is printed, which triggers a count.
    """
    # Rows of the same key are numbered newest-first; rows tied on
    # `_ingest_ts` have no further tie-breaker.
    newest_first = Window.partitionBy(*pk_columns).orderBy(F.col("_ingest_ts").desc())

    ranked = df.withColumn("_rn", F.row_number().over(newest_first))
    df_dedup = ranked.filter(F.col("_rn") == 1).drop("_rn")

    print(f"‚úÖ Registros √∫nicos ap√≥s deduplica√ß√£o: {df_dedup.count():,}")

    return df_dedup


def merge_into_silver(df_source, silver_table_name: str, pk_columns: list, partition_by: list):
    """
    Upsert (SCD Type 1) a transformed DataFrame into a Silver Delta table and
    make sure the table is registered in the metastore.

    Behaviour:
      * If no Delta table exists at the Silver path, the DataFrame is written
        there (optionally partitioned) and the table is registered.
      * Otherwise the table registration is (re)asserted and a MERGE is run:
        matched rows have every non-key column overwritten, unmatched rows
        are inserted.

    The MERGE compares keys with the null-safe operator `<=>`. With plain `=`
    a row whose key contains NULL never matches its previous version and is
    inserted again on every run, while `deduplicate_by_pk` (window partition)
    already treats NULL keys as one group.

    Args:
        df_source: Transformed, deduplicated DataFrame (Silver column names).
        silver_table_name: Silver table name, e.g. 'silver_pais'.
        pk_columns: Primary-key columns used in the MERGE condition.
        partition_by: Partition columns for the initial write, or None.

    Raises:
        ValueError: if `pk_columns` is empty (no MERGE condition can be built).
    """
    if not pk_columns:
        raise ValueError(
            f"pk_columns must not be empty for Silver table '{silver_table_name}'"
        )

    # Physical path on ADLS Gen2 and the metastore name that points at it.
    silver_path = f"{SILVER_BASE_PATH}/{silver_table_name}"
    full_table_name = f"silver_balancacomercial.{silver_table_name}"
    register_sql = f"""
        CREATE TABLE IF NOT EXISTS {full_table_name}
        USING DELTA
        LOCATION '{silver_path}'
    """

    # First load: nothing to merge into, so write the files and register.
    if not DeltaTable.isDeltaTable(spark, silver_path):
        print(f"üÜï Criando tabela: {full_table_name}")

        writer = df_source.write.format("delta").mode("overwrite")

        if partition_by:
            writer = writer.partitionBy(*partition_by)
            print(f"üìÇ Particionamento: {partition_by}")

        writer.save(silver_path)
        spark.sql(register_sql)

        print(f"‚úÖ Tabela criada e registrada no Hive")
        return

    # The files exist; re-assert the registration so a run that previously
    # failed between the write and the CREATE TABLE heals itself (no-op when
    # the table is already registered).
    spark.sql(register_sql)

    # Incremental MERGE (SCD Type 1).
    print(f"üîÑ Executando MERGE: {full_table_name}")

    delta_silver = DeltaTable.forPath(spark, silver_path)
    merge_condition = " AND ".join(
        f"silver.{pk} <=> source.{pk}" for pk in pk_columns
    )

    # Update everything except the key; insert every column.
    update_columns = {
        col: f"source.{col}" for col in df_source.columns if col not in pk_columns
    }
    insert_columns = {col: f"source.{col}" for col in df_source.columns}

    (
        delta_silver.alias("silver")
        .merge(df_source.alias("source"), merge_condition)
        .whenMatchedUpdate(set=update_columns)
        .whenNotMatchedInsert(values=insert_columns)
        .execute()
    )

    print(f"‚úÖ MERGE conclu√≠do")


def update_control_table(silver_table_name: str, max_timestamp: datetime, records_count: int, execution_id: str):
    """
    Record the new watermark of a Silver table in the control table.

    One row per `table_name` is kept at CONTROL_TABLE_PATH: an existing row is
    updated (with `records_processed` accumulated), otherwise a row is
    inserted.

    The one-row DataFrame is built with an explicit schema: inferring the
    schema from a single row fails when `max_timestamp` is None. On update, a
    None watermark keeps the previously stored one instead of erasing it, and
    a NULL stored counter is treated as 0.

    Args:
        silver_table_name: Name of the Silver table.
        max_timestamp: Greatest `_ingest_ts` processed in this run (may be None).
        records_count: Number of records processed in this run.
        execution_id: Identifier of this execution.
    """
    # Taken once, so the stored `last_update` is a single consistent value.
    current_ts = datetime.now()

    control_schema = (
        "table_name STRING, "
        "last_ingestion_timestamp TIMESTAMP, "
        "last_update TIMESTAMP, "
        "records_processed BIGINT, "
        "execution_id STRING"
    )
    control_data = [(silver_table_name, max_timestamp, current_ts, records_count, execution_id)]
    df_new = spark.createDataFrame(control_data, control_schema)

    if DeltaTable.isDeltaTable(spark, CONTROL_TABLE_PATH):
        delta_control = DeltaTable.forPath(spark, CONTROL_TABLE_PATH)

        (
            delta_control.alias("t")
            .merge(df_new.alias("s"), "t.table_name = s.table_name")
            .whenMatchedUpdate(set={
                # Never replace a valid watermark with NULL.
                "last_ingestion_timestamp": "coalesce(s.last_ingestion_timestamp, t.last_ingestion_timestamp)",
                "last_update": "s.last_update",
                # Running total; a NULL stored value must not poison the sum.
                "records_processed": "coalesce(t.records_processed, 0) + s.records_processed",
                "execution_id": "s.execution_id"
            })
            .whenNotMatchedInsert(values={
                "table_name": "s.table_name",
                "last_ingestion_timestamp": "s.last_ingestion_timestamp",
                "last_update": "s.last_update",
                "records_processed": "s.records_processed",
                "execution_id": "s.execution_id"
            })
            .execute()
        )

        print(f"üìù Controle atualizado")
    else:
        # No Delta table at the control path yet: initialise it with this row.
        df_new.write.format("delta").mode("overwrite").save(CONTROL_TABLE_PATH)
        print(f"üÜï Controle inicializado")

In [0]:
# MAGIC ## 8. Processamento Incremental

# COMMAND ----------

# Driver cell: runs the Bronze -> Silver pipeline for every entry of
# `table_config`. A failure in one table is recorded and the loop moves on
# to the next table.
import traceback

execution_id = datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"üöÄ Execution ID: {execution_id}")
print(f"{'='*80}\n")

# Execution statistics, consumed by the report cell.
total_tables_processed = 0
total_records_processed = 0
tables_created = []
tables_updated = []
tables_failed = []

for config in table_config:

    bronze_table = config["bronze_table"]
    silver_table = config["silver_table"]
    pk_cols = config["key_columns"]
    column_mapping = config["column_mapping"]
    partition_by = config["partition_by"]
    description = config["description"]

    print(f"\n{'='*80}")
    print(f"üìã Bronze: bronze_balancacomercial.{bronze_table}")
    print(f"üìã Silver: silver_balancacomercial.{silver_table}")
    print(f"üìù Descri√ß√£o: {description}")
    print(f"{'='*80}")

    try:
        # 1. Probe the Bronze table. Only Exception is caught (a bare except
        #    would also swallow KeyboardInterrupt/SystemExit) and the cause is
        #    printed, since the probe can fail for reasons other than absence.
        try:
            spark.table(f"bronze_balancacomercial.{bronze_table}").limit(1).count()
        except Exception as probe_error:
            print(f"‚ö†Ô∏è Tabela Bronze n√£o existe: bronze_balancacomercial.{bronze_table}")
            print(f"   Erro: {str(probe_error)[:300]}")
            tables_failed.append(silver_table)
            continue

        # 2. Last processed watermark (None on first execution).
        last_processed_at = get_last_processed_timestamp(silver_table)

        # 3. Incremental Bronze rows (not deduplicated yet).
        df_latest = get_bronze_incremental_latest(bronze_table, pk_cols, last_processed_at)

        # 4. Nothing new? One row is enough to know - no full count needed.
        if df_latest.limit(1).count() == 0:
            print(f"‚ÑπÔ∏è Sem dados novos para processar")
            continue

        # 5. Rename / cast / trim.
        df_mapped = apply_column_mapping(df_latest, column_mapping)

        # 6. Deduplicate AFTER the mapping, because the keys use Silver names.
        df_transformed = deduplicate_by_pk(df_mapped, pk_cols)

        # 7. Remember whether the Silver table is new (for the report).
        is_new_table = not spark.catalog.tableExists(f"silver_balancacomercial.{silver_table}")

        # 8. Watermark and row count in ONE aggregation, taken BEFORE the
        #    MERGE. The DataFrame is lazy: evaluating it again after the MERGE
        #    costs two extra passes and, if Bronze received rows in between,
        #    could push the watermark past rows that were never merged.
        run_stats = df_transformed.agg(
            F.max("_ingest_ts").alias("max_ingest_ts"),
            F.count(F.lit(1)).alias("row_count"),
        ).first()
        max_timestamp = run_stats["max_ingest_ts"]
        records_count = run_stats["row_count"]

        # 9. MERGE into Silver.
        merge_into_silver(df_transformed, silver_table, pk_cols, partition_by)

        # 10. Persist the watermark only after the MERGE succeeded.
        update_control_table(silver_table, max_timestamp, records_count, execution_id)

        # 11. Statistics.
        total_tables_processed += 1
        total_records_processed += records_count

        if is_new_table:
            tables_created.append(silver_table)
        else:
            tables_updated.append(silver_table)

        print(f"‚úÖ {silver_table} processado com sucesso!")

    except Exception as e:
        print(f"‚ùå ERRO ao processar {silver_table}")
        print(f"   Erro: {str(e)}")
        tables_failed.append(silver_table)

        traceback.print_exc()

# NOTE(review): the cell finishes normally even when `tables_failed` is not
# empty, so a scheduler sees success - confirm whether the job should fail.
print(f"\n{'='*80}")
print(f"üéâ PROCESSAMENTO CONCLU√çDO")
print(f"{'='*80}")

In [0]:
# MAGIC ## 9. Relatório de Execução

# COMMAND ----------

# Final report: prints the statistics collected by the processing loop
# (execution_id, totals, created/updated/failed tables) together with the
# storage locations. The "next execution" note names the watermark column
# actually filtered on by the pipeline (`_ingest_ts`).
print(f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë            RELAT√ìRIO - SILVER BALAN√áA COMERCIAL                  ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

üìä ESTAT√çSTICAS GERAIS:
   ‚Ä¢ Execution ID: {execution_id}
   ‚Ä¢ Tabelas processadas: {total_tables_processed}
   ‚Ä¢ Registros processados: {total_records_processed:,}
   ‚Ä¢ Protocolo: abfss:// (ADLS Gen2)

‚úÖ TABELAS CRIADAS ({len(tables_created)}):
{chr(10).join(f'   ‚Ä¢ {t}' for t in tables_created) if tables_created else '   (nenhuma)'}

üîÑ TABELAS ATUALIZADAS ({len(tables_updated)}):
{chr(10).join(f'   ‚Ä¢ {t}' for t in tables_updated) if tables_updated else '   (nenhuma)'}

{f'‚ùå TABELAS COM FALHA ({len(tables_failed)}):' if tables_failed else ''}
{chr(10).join(f'   ‚Ä¢ {t}' for t in tables_failed) if tables_failed else ''}

üóÑÔ∏è ARQUITETURA:
   ‚Ä¢ Bronze (leitura): Hive Metastore ‚Üí bronze_balancacomercial.*
   ‚Ä¢ Silver (escrita): ADLS Gen2 + Hive ‚Üí silver_balancacomercial.*
   ‚Ä¢ Controle: metadata.silver_control_balanca

üìÇ LOCALIZA√á√ÉO F√çSICA:
   ‚Ä¢ Silver Tables: {SILVER_BASE_PATH}
   ‚Ä¢ Control Table: {CONTROL_TABLE_PATH}

üîó CONEX√ÉO POWER BI:
   ‚Ä¢ Catalog: hive_metastore
   ‚Ä¢ Schema: silver_balancacomercial
   ‚Ä¢ Autentica√ß√£o: Configurada no cluster (Service Principal/Managed Identity)

‚è±Ô∏è PR√ìXIMA EXECU√á√ÉO:
   ‚Ä¢ Apenas dados com _ingest_ts > √∫ltimo processamento
   ‚Ä¢ Processamento incremental autom√°tico
""")