In [0]:
# 1. Imports e Configuração

# COMMAND ----------

import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from datetime import datetime

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when there is one, otherwise it builds "SilverCNPJ".
spark = (
    SparkSession.builder
    .appName("SilverCNPJ")
    .getOrCreate()
)


In [0]:
# MAGIC ## 2. Configuração de Storage (ABFSS - ADLS Gen2)

# COMMAND ----------

# Target storage account name, read from the Databricks secret scope.
TGT_STORAGE_ACCOUNT = dbutils.secrets.get(
    scope="acelera-grupo-5-kv",
    key="tgt-storage-account",
)

# Container that holds the Silver layer.
SILVER_CONTAINER = "silver"

# abfss:// (ADLS Gen2) locations: the root for the Silver CNPJ tables and the
# Delta location of the incremental control table.
SILVER_BASE_PATH = f"abfss://{SILVER_CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/cnpj"
CONTROL_TABLE_PATH = f"abfss://{SILVER_CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/metadata/silver_control_cnpj"

# Echo the resolved configuration, one line per setting.
print("\n".join([
    f"‚úÖ Storage Account: {TGT_STORAGE_ACCOUNT}",
    f"‚úÖ Silver Container: {SILVER_CONTAINER}",
    f"‚úÖ Protocolo: abfss:// (ADLS Gen2)",
    f"üìÇ Silver Base Path: {SILVER_BASE_PATH}",
    f"üìÇ Control Table Path: {CONTROL_TABLE_PATH}",
    f"üóÑÔ∏è Bronze: Leitura via Hive Metastore (bronze)",
]))

# COMMAND ----------

# Fail fast when the cluster cannot list the root of the silver container.
print("\nüß™ Validando acesso ao container silver...")

try:
    test_path = f"abfss://{SILVER_CONTAINER}@{TGT_STORAGE_ACCOUNT}.dfs.core.windows.net/"
    files = dbutils.fs.ls(test_path)
    print(f"‚úÖ Container 'silver' acess√≠vel - {len(files)} itens encontrados")

except Exception as e:
    # Print a troubleshooting checklist plus the first 300 characters of the
    # error, then re-raise so the notebook run stops here.
    print("\n".join([
        f"‚ùå ERRO: Cluster n√£o tem acesso ao container 'silver'",
        f"\n‚ö†Ô∏è Verifique:",
        f"   1. Container 'silver' existe no Storage Account",
        f"   2. Service Principal/Managed Identity tem permiss√µes",
        f"   3. Role necess√°ria: 'Storage Blob Data Contributor'",
        f"\nErro: {str(e)[:300]}",
    ]))
    raise

In [0]:
# MAGIC ## 3. Otimizações Spark

# COMMAND ----------

# Session-level switches, all set to "true": Delta optimized writes,
# auto-compaction and schema auto-merge, plus adaptive query execution with
# partition coalescing.
for _conf_key in (
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.autoCompact.enabled",
    "spark.databricks.delta.schema.autoMerge.enabled",
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(_conf_key, "true")

In [0]:
# MAGIC ## 4. Criação de Databases

# COMMAND ----------

# Make sure the two metastore databases used by this notebook exist.
for _db_name in ("silver", "metadata"):
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {_db_name}")

print("‚úÖ Databases criados no Hive Metastore")

In [0]:
# MAGIC ## 5. Criação da Tabela de Controle

# COMMAND ----------

# DDL for the incremental-control table: one watermark row per Silver table,
# stored as an external Delta table at CONTROL_TABLE_PATH.
_control_ddl = f"""
    CREATE TABLE IF NOT EXISTS metadata.silver_control_cnpj (
        table_name STRING COMMENT 'Nome da tabela Silver',
        last_ingestion_timestamp TIMESTAMP COMMENT '√öltimo timestamp processado',
        last_update TIMESTAMP COMMENT 'Timestamp da √∫ltima atualiza√ß√£o',
        records_processed BIGINT COMMENT 'Total de registros processados',
        execution_id STRING COMMENT 'ID da execu√ß√£o'
    )
    USING DELTA
    LOCATION '{CONTROL_TABLE_PATH}'
    COMMENT 'Controle incremental - Silver CNPJ'
"""
spark.sql(_control_ddl)

print("‚úÖ Tabela de controle criada")

# Read back the physical location recorded for the table and show it.
_control_detail = spark.sql("DESCRIBE DETAIL metadata.silver_control_cnpj")
location = _control_detail.select("location").first()[0]
print(f"üìÇ Location: {location}")

In [0]:
# MAGIC ## 6. Configuração de Transformações

# COMMAND ----------

# Declarative Bronze -> Silver configuration, one dict per table. The
# processing loop reads these keys from every entry:
#   bronze_table   - table name read from the `bronze` database
#   silver_table   - table name written under SILVER_BASE_PATH / `silver` database
#   key_columns    - Silver column names used for de-duplication and as MERGE keys
#   column_mapping - {bronze_column: {"rename": silver_name, "cast": type, "trim": bool}};
#                    a "cast" of "date" is parsed with the yyyyMMdd pattern by
#                    apply_column_mapping, any other value is passed to Column.cast
#   partition_by   - list of partition columns used when the table is first created, or None
#   description    - human-readable label printed while processing
table_config = [
    # ========================
    # CNAE (economic activity classification)
    # ========================
    {
        "bronze_table": "cnaes",
        "silver_table": "cnaes",
        "key_columns": ["codigo_cnae"],
        "column_mapping": {
            "codigo_cnae": {"rename": "codigo_cnae", "cast": "string", "trim": False},
            "descricao_cnae": {"rename": "descricao_cnae", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Classifica√ß√£o Nacional de Atividades Econ√¥micas (CNAE)"
    },

    # ========================
    # COMPANIES
    # ========================
    {
        "bronze_table": "empresas",
        "silver_table": "empresas",
        "key_columns": ["cnpj_basico"],
        "column_mapping": {
            "cnpj_basico": {"rename": "cnpj_basico", "cast": "string", "trim": False},
            "razao_social": {"rename": "razao_social", "cast": "string", "trim": True},
            "natureza_juridica": {"rename": "natureza_juridica", "cast": "string", "trim": False},
            "qualificacao_responsavel": {"rename": "qualificacao_responsavel", "cast": "string", "trim": False},
            # NOTE(review): a plain cast to decimal yields NULL for values that use a
            # comma as decimal separator - confirm the Bronze format of this column.
            "capital_social": {"rename": "capital_social", "cast": "decimal(18,2)", "trim": False},
            "porte_empresa": {"rename": "porte_empresa", "cast": "string", "trim": False},
            "ente_federativo_responsavel": {"rename": "ente_federativo_responsavel", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Dados cadastrais das empresas (n√≠vel CNPJ b√°sico)"
    },

    # ========================
    # ESTABLISHMENTS
    # ========================
    {
        "bronze_table": "estabelecimentos",
        "silver_table": "estabelecimentos",
        "key_columns": ["cnpj_basico", "cnpj_ordem", "cnpj_dv"],
        "column_mapping": {
            "cnpj_basico": {"rename": "cnpj_basico", "cast": "string", "trim": False},
            "cnpj_ordem": {"rename": "cnpj_ordem", "cast": "string", "trim": False},
            "cnpj_dv": {"rename": "cnpj_dv", "cast": "string", "trim": False},
            "identificador_matriz_filial": {"rename": "identificador_matriz_filial", "cast": "string", "trim": False},
            "nome_fantasia": {"rename": "nome_fantasia", "cast": "string", "trim": True},
            "situacao_cadastral": {"rename": "situacao_cadastral", "cast": "string", "trim": False},
            "data_situacao_cadastral": {"rename": "data_situacao_cadastral", "cast": "date", "trim": False},
            # NOTE(review): cast to int here, while motivos.codigo_motivo below is kept
            # as string - confirm the type difference is intended for joins.
            "motivo_situacao_cadastral": {"rename": "motivo_situacao_cadastral", "cast": "int", "trim": False},
            "nome_cidade_exterior": {"rename": "nome_cidade_exterior", "cast": "string", "trim": True},
            # NOTE(review): cast to int here, while paises.codigo_pais and socios.pais
            # below are kept as string - confirm the type difference is intended.
            "pais": {"rename": "pais", "cast": "int", "trim": False},
            "data_inicio_atividade": {"rename": "data_inicio_atividade", "cast": "date", "trim": False},
            "cnae_fiscal_principal": {"rename": "cnae_fiscal_principal", "cast": "string", "trim": False},
            "cnae_fiscal_secundaria": {"rename": "cnae_fiscal_secundaria", "cast": "string", "trim": False},
            "tipo_logradouro": {"rename": "tipo_logradouro", "cast": "string", "trim": True},
            "logradouro": {"rename": "logradouro", "cast": "string", "trim": True},
            "numero": {"rename": "numero", "cast": "string", "trim": True},
            "complemento": {"rename": "complemento", "cast": "string", "trim": True},
            "bairro": {"rename": "bairro", "cast": "string", "trim": True},
            "cep": {"rename": "cep", "cast": "string", "trim": False},
            "uf": {"rename": "uf", "cast": "string", "trim": False},
            "municipio": {"rename": "municipio", "cast": "string", "trim": False},
            # The phone columns are the only ones renamed: the underscore before
            # the digit is dropped in Silver.
            "ddd_1": {"rename": "ddd1", "cast": "string", "trim": False},
            "telefone_1": {"rename": "telefone1", "cast": "string", "trim": False},
            "ddd_2": {"rename": "ddd2", "cast": "string", "trim": False},
            "telefone_2": {"rename": "telefone2", "cast": "string", "trim": False},
            "ddd_fax": {"rename": "ddd_fax", "cast": "string", "trim": False},
            "fax": {"rename": "fax", "cast": "string", "trim": False},
            "email": {"rename": "email", "cast": "string", "trim": True},
            "situacao_especial": {"rename": "situacao_especial", "cast": "string", "trim": True},
            "data_situacao_especial": {"rename": "data_situacao_especial", "cast": "date", "trim": False}
        },
        # The only table in this list that is partitioned.
        "partition_by": ["uf"],
        "description": "Estabelecimentos vinculados ao CNPJ (matriz e filiais)"
    },

    # ========================
    # REASONS (registration status reasons)
    # ========================
    {
        "bronze_table": "motivos",
        "silver_table": "motivos",
        "key_columns": ["codigo_motivo"],
        "column_mapping": {
            "codigo_motivo": {"rename": "codigo_motivo", "cast": "string", "trim": False},
            "descricao_motivo": {"rename": "descricao_motivo", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Motivos de situa√ß√£o cadastral"
    },

    # ========================
    # MUNICIPALITIES
    # ========================
    {
        "bronze_table": "municipios",
        "silver_table": "municipios",
        "key_columns": ["codigo_municipio"],
        "column_mapping": {
            "codigo_municipio": {"rename": "codigo_municipio", "cast": "string", "trim": False},
            "descricao_municipio": {"rename": "descricao_municipio", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Cadastro de munic√≠pios do Brasil"
    },

    # ========================
    # LEGAL NATURES
    # ========================
    {
        "bronze_table": "naturezas",
        "silver_table": "naturezas",
        "key_columns": ["codigo_natureza_juridica"],
        "column_mapping": {
            "codigo_natureza_juridica": {"rename": "codigo_natureza_juridica", "cast": "string", "trim": False},
            "descricao_natureza_juridica": {"rename": "descricao_natureza_juridica", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Naturezas jur√≠dicas das empresas"
    },

    # ========================
    # COUNTRIES
    # ========================
    {
        "bronze_table": "paises",
        "silver_table": "paises",
        "key_columns": ["codigo_pais"],
        "column_mapping": {
            "codigo_pais": {"rename": "codigo_pais", "cast":"string","trim": False},
            "descricao_pais": {"rename": "descricao_pais", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Cadastro de pa√≠ses"
    },

    # ========================
    # QUALIFICATIONS
    # ========================
    {
        "bronze_table": "qualificacoes",
        "silver_table": "qualificacoes",
        "key_columns": ["codigo_qualificacao"],
        "column_mapping": {
            "codigo_qualificacao": {"rename": "codigo_qualificacao", "cast": "string", "trim": False},
            "descricao_qualificacao": {"rename": "descricao_qualificacao", "cast": "string", "trim": True}
        },
        "partition_by": None,
        "description": "Qualifica√ß√µes de s√≥cios e respons√°veis legais"
    },

    # ========================
    # SIMPLES / MEI
    # ========================
    {
        "bronze_table": "simples",
        "silver_table": "simples",
        "key_columns": ["cnpj_basico"],
        "column_mapping": {
            "cnpj_basico": {"rename": "cnpj_basico", "cast": "string", "trim": False},
            "opcao_simples": {"rename": "opcao_simples", "cast": "string", "trim": False},
            "data_opcao_simples": {"rename": "data_opcao_simples", "cast": "date", "trim": False},
            "data_exclusao_simples": {"rename": "data_exclusao_simples", "cast": "date", "trim": False},
            "opcao_mei": {"rename": "opcao_mei", "cast": "string", "trim": False},
            "data_opcao_mei": {"rename": "data_opcao_mei", "cast": "date", "trim": False},
            "data_exclusao_mei": {"rename": "data_exclusao_mei", "cast": "date", "trim": False}
        },
        "partition_by": None,
        "description": "Op√ß√£o pelo Simples Nacional e MEI"
    },

    # ========================
    # PARTNERS
    # ========================
    {
        "bronze_table": "socios",
        "silver_table": "socios",
        # NOTE(review): de-duplication keeps one row per (cnpj_basico, cpf_cnpj_socio);
        # confirm this pair really identifies a partner row in the source data.
        "key_columns": ["cnpj_basico", "cpf_cnpj_socio"],
        "column_mapping": {
            "cnpj_basico": {"rename": "cnpj_basico", "cast": "string", "trim": False},
            "identificador_socio": {"rename": "identificador_socio", "cast": "string", "trim": False},
            "nome_socio_razao_social": {"rename": "nome_socio_razao_social", "cast": "string", "trim": True},
            "cpf_cnpj_socio": {"rename": "cpf_cnpj_socio", "cast": "string", "trim": False},
            "qualificacao_socio": {"rename": "qualificacao_socio", "cast": "string", "trim": False},
            "data_entrada_sociedade": {"rename": "data_entrada_sociedade", "cast": "date", "trim": False},
            "pais": {"rename": "pais", "cast": "string", "trim": False},
            "cpf_representante_legal": {"rename": "cpf_representante_legal", "cast": "string", "trim": True},
            "nome_representante_legal": {"rename": "nome_representante_legal", "cast": "string", "trim": True},
            "qualificacao_representante_legal": {"rename": "qualificacao_representante_legal", "cast": "string", "trim": False},
            "faixa_etaria": {"rename": "faixa_etaria", "cast": "int", "trim": False}
        },
        "partition_by": None,
        "description": "Quadro societ√°rio das empresas"
    }
]

# Report how many tables were configured.
print(f"‚úÖ {len(table_config)} tabelas configuradas")

In [0]:
# MAGIC ## 7. Funções de Transformação

# COMMAND ----------

def get_last_processed_timestamp(silver_table_name: str) -> datetime:
    """
    Look up the watermark stored for a Silver table in the control table.

    Reads metadata.silver_control_cnpj, keeps the rows for the given table and
    takes the one with the most recent last_update.

    Args:
        silver_table_name: Name of the Silver table.

    Returns:
        The stored last_ingestion_timestamp, or None when the table has no
        control row yet or when reading the control table raises.
    """
    try:
        control_rows = (
            spark.table("metadata.silver_control_cnpj")
            .where(F.col("table_name") == silver_table_name)
            .select("last_ingestion_timestamp")
            .orderBy(F.desc("last_update"))
            .limit(1)
            .collect()
        )

        # No control row yet: the caller treats None as "first run".
        if not control_rows:
            print(f"üÜï Primeira execu√ß√£o")
            return None

        timestamp = control_rows[0]["last_ingestion_timestamp"]
        print(f"üìÖ √öltimo processamento: {timestamp}")
        return timestamp

    except Exception as e:
        # Any failure while reading the control table is reported and handled
        # like a missing watermark.
        print(f"‚ö†Ô∏è Controle vazio: {e}")
        return None


def get_bronze_incremental_latest(bronze_table: str, pk_columns: list, last_processed_at: datetime):
    """
    Read the not-yet-processed rows of a Bronze table through the metastore.

    Args:
        bronze_table: Bronze table name, without the database prefix.
        pk_columns: Primary-key column names (accepted but not used here).
        last_processed_at: Watermark of the previous run; a falsy value
            requests a full load.

    Returns:
        The whole Bronze table on a full load, otherwise only the rows whose
        _ingestion_timestamp is strictly greater than the watermark. Nothing is
        de-duplicated here; that happens after the column mapping, once the
        columns carry their Silver names.

    Raises:
        Whatever spark.table raises when the Bronze table cannot be resolved.
    """
    full_table_name = f"bronze.{bronze_table}"

    try:
        df_bronze = spark.table(full_table_name)
    except Exception:
        print(f"‚ùå Tabela Bronze n√£o existe: {full_table_name}")
        raise

    # Full load: no watermark has been recorded yet.
    if not last_processed_at:
        records_total = df_bronze.count()
        print(f"üìä Carga FULL: {records_total:,}")
        return df_bronze

    # Incremental load: keep only rows ingested after the watermark.
    newer_than_watermark = F.col("_ingestion_timestamp") > F.lit(last_processed_at)
    df_filtered = df_bronze.filter(newer_than_watermark)
    records_filtered = df_filtered.count()
    print(f"üìä Registros incrementais: {records_filtered:,}")
    return df_filtered


def apply_column_mapping(df, column_mapping: dict):
    """
    Project a Bronze DataFrame onto its Silver columns.

    Each entry of column_mapping maps a source column either to a plain string
    (the target name) or to a dict with the optional keys "rename", "cast" and
    "trim". Source columns missing from df are reported and skipped.

    Args:
        df: Bronze DataFrame.
        column_mapping: Transformation rules keyed by source column name.

    Returns:
        A DataFrame with the mapped columns in mapping order, followed by the
        technical columns ingestion_date, _ingestion_timestamp and
        _source_path when present, and a new _silver_timestamp column holding
        the current timestamp.
    """

    def _mapped_column(source_col, rules):
        # Build the aliased expression for a single source column.
        if isinstance(rules, str):
            return F.col(source_col).alias(rules)

        expr = F.col(source_col)
        if rules.get("trim", False):
            expr = F.trim(expr)

        cast_type = rules.get("cast")
        if cast_type == "date":
            # Dates are parsed with the yyyyMMdd pattern, e.g. "20210315" -> 2021-03-15.
            expr = F.to_date(expr, "yyyyMMdd")
        elif cast_type:
            expr = expr.cast(cast_type)

        return expr.alias(rules.get("rename", source_col))

    present = set(df.columns)
    select_exprs = []

    for source_col, rules in column_mapping.items():
        if source_col in present:
            select_exprs.append(_mapped_column(source_col, rules))
        else:
            print(f"‚ö†Ô∏è Coluna {source_col} n√£o encontrada - ignorando")

    # Carry the Bronze technical metadata through unchanged.
    select_exprs.extend(
        F.col(col_name)
        for col_name in ("ingestion_date", "_ingestion_timestamp", "_source_path")
        if col_name in present
    )

    # Stamp every row with the time it was produced for Silver.
    select_exprs.append(F.current_timestamp().alias("_silver_timestamp"))

    return df.select(*select_exprs)


def deduplicate_by_pk(df, pk_columns: list):
    """
    Keep a single row per primary key: the one with the newest _ingestion_timestamp.

    Rows are ranked with row_number() inside each key, so when several rows of
    one key share the same _ingestion_timestamp the survivor is not
    deterministic.

    Args:
        df: Already-transformed DataFrame (Silver column names).
        pk_columns: Primary-key column names (Silver names).

    Returns:
        The de-duplicated DataFrame. The number of remaining rows is counted
        and printed as a side effect.
    """
    newest_first = Window.partitionBy(*pk_columns).orderBy(F.desc("_ingestion_timestamp"))

    ranked = df.withColumn("_rn", F.row_number().over(newest_first))
    df_dedup = ranked.where(F.col("_rn") == 1).drop("_rn")

    records_dedup = df_dedup.count()
    print(f"‚úÖ Registros √∫nicos ap√≥s deduplica√ß√£o: {records_dedup:,}")

    return df_dedup


def merge_into_silver(df_source, silver_table_name: str, pk_columns: list, partition_by: list):
    """
    Write a transformed DataFrame into its Silver Delta table (SCD type 1).

    When no Delta table exists yet at the target path, the DataFrame is written
    there (optionally partitioned) and the table is registered in the
    metastore. Otherwise the DataFrame is merged into the existing table:
    matching keys are updated, new keys are inserted.

    Args:
        df_source: Transformed DataFrame, expected to hold one row per key.
        silver_table_name: Silver table name without the database prefix
            (e.g. 'empresas').
        pk_columns: Primary-key column names used as merge keys.
        partition_by: Partition columns used when the table is created, or None.
    """
    # Physical location in ADLS Gen2 and the metastore name pointing at it.
    silver_path = f"{SILVER_BASE_PATH}/{silver_table_name}"
    full_table_name = f"silver.{silver_table_name}"

    # Registers the Delta location in the metastore; a no-op when it already exists.
    register_sql = f"""
        CREATE TABLE IF NOT EXISTS {full_table_name}
        USING DELTA
        LOCATION '{silver_path}'
    """

    # First load: no Delta table at the path yet.
    if not DeltaTable.isDeltaTable(spark, silver_path):
        print(f"üÜï Criando tabela: {full_table_name}")

        writer = df_source.write.format("delta").mode("overwrite")

        if partition_by:
            writer = writer.partitionBy(*partition_by)
            print(f"üìÇ Particionamento: {partition_by}")

        writer.save(silver_path)
        spark.sql(register_sql)

        print(f"‚úÖ Tabela criada e registrada no Hive")
        return

    # Incremental MERGE (SCD type 1).
    print(f"üîÑ Executando MERGE: {full_table_name}")

    # The Delta files may exist without a metastore entry (e.g. the table was
    # dropped from the metastore); make sure the table stays queryable by name.
    spark.sql(register_sql)

    delta_silver = DeltaTable.forPath(spark, silver_path)

    # Null-safe equality (<=>): with plain '=' a row whose key contains NULL
    # never matches its existing copy and would be inserted again on every run,
    # whereas the de-duplication step does group NULL keys together.
    merge_condition = " AND ".join(f"silver.{pk} <=> source.{pk}" for pk in pk_columns)

    key_set = set(pk_columns)
    # Matched rows: overwrite every non-key column with the source value.
    update_columns = {col: f"source.{col}" for col in df_source.columns if col not in key_set}
    # New rows: insert every column.
    insert_columns = {col: f"source.{col}" for col in df_source.columns}

    (
        delta_silver.alias("silver")
        .merge(df_source.alias("source"), merge_condition)
        .whenMatchedUpdate(set=update_columns)
        .whenNotMatchedInsert(values=insert_columns)
        .execute()
    )

    print(f"‚úÖ MERGE conclu√≠do")


def update_control_table(silver_table_name: str, max_timestamp: datetime, records_count: int, execution_id: str):
    """
    Record the watermark of a finished run in the control table.

    Upserts one row per Silver table into the Delta table at
    CONTROL_TABLE_PATH. On update, records_processed is accumulated across
    runs; the other columns are replaced. A None max_timestamp keeps the
    previously stored watermark instead of erasing it.

    Args:
        silver_table_name: Name of the Silver table.
        max_timestamp: Highest _ingestion_timestamp processed (may be None).
        records_count: Number of records processed in this run.
        execution_id: Identifier of this run.
    """
    # Taken once so the value written as last_update is a single instant.
    current_ts = datetime.now()

    # Explicit schema: type inference fails when max_timestamp is None and
    # could otherwise drift from the column types declared for the control table.
    control_schema = (
        "table_name STRING, "
        "last_ingestion_timestamp TIMESTAMP, "
        "last_update TIMESTAMP, "
        "records_processed BIGINT, "
        "execution_id STRING"
    )
    control_data = [(silver_table_name, max_timestamp, current_ts, records_count, execution_id)]
    df_new = spark.createDataFrame(control_data, schema=control_schema)

    if DeltaTable.isDeltaTable(spark, CONTROL_TABLE_PATH):
        delta_control = DeltaTable.forPath(spark, CONTROL_TABLE_PATH)

        (
            delta_control.alias("t")
            .merge(df_new.alias("s"), "t.table_name = s.table_name")
            .whenMatchedUpdate(set={
                # Never replace an existing watermark with NULL.
                "last_ingestion_timestamp": "coalesce(s.last_ingestion_timestamp, t.last_ingestion_timestamp)",
                "last_update": "s.last_update",
                # Running total; a NULL stored counter is treated as zero.
                "records_processed": "coalesce(t.records_processed, 0) + s.records_processed",
                "execution_id": "s.execution_id"
            })
            .whenNotMatchedInsert(values={
                "table_name": "s.table_name",
                "last_ingestion_timestamp": "s.last_ingestion_timestamp",
                "last_update": "s.last_update",
                "records_processed": "s.records_processed",
                "execution_id": "s.execution_id"
            })
            .execute()
        )

        print(f"üìù Controle atualizado")
    else:
        # No Delta table at the control path yet: create it from this first row.
        df_new.write.format("delta").mode("overwrite").save(CONTROL_TABLE_PATH)
        print(f"üÜï Controle inicializado")

In [0]:
# MAGIC ## 8. Processamento Incremental

# COMMAND ----------

import traceback

# Identifier shared by every control-table row written during this run.
execution_id = datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"üöÄ Execution ID: {execution_id}")
print(f"{'='*80}\n")

# Run statistics, consumed by the report cell.
total_tables_processed = 0
total_records_processed = 0
tables_created = []
tables_updated = []
tables_failed = []

# Process each configured table independently: a failure is recorded in
# tables_failed and the loop moves on to the next table.
for config in table_config:

    bronze_table = config["bronze_table"]
    silver_table = config["silver_table"]
    pk_cols = config["key_columns"]
    column_mapping = config["column_mapping"]
    partition_by = config["partition_by"]
    description = config["description"]

    print(f"\n{'='*80}")
    print(f"üìã Bronze: bronze.{bronze_table}")
    print(f"üìã Silver: silver.{silver_table}")
    print(f"üìù Descri√ß√£o: {description}")
    print(f"{'='*80}")

    try:
        # 1. Check that the Bronze table can be read.
        try:
            spark.table(f"bronze.{bronze_table}").limit(1).count()
        except Exception:
            # 'except Exception' rather than a bare 'except', so that
            # KeyboardInterrupt / SystemExit still stop the notebook.
            print(f"‚ö†Ô∏è Tabela Bronze n√£o existe: bronze.{bronze_table}")
            tables_failed.append(silver_table)
            continue

        # 2. Fetch the watermark of the previous run.
        last_processed_at = get_last_processed_timestamp(silver_table)

        # 3. Read the incremental Bronze rows (not de-duplicated yet).
        df_latest = get_bronze_incremental_latest(bronze_table, pk_cols, last_processed_at)

        # 4. Stop early when there is nothing new. Fetching a single row is
        #    enough to know; the full count was already printed in step 3.
        if len(df_latest.head(1)) == 0:
            print(f"‚ÑπÔ∏è Sem dados novos para processar")
            continue

        # 5. Apply the column mapping (rename, cast, trim).
        df_transformed = apply_column_mapping(df_latest, column_mapping)

        # 6. De-duplicate after the mapping, using the Silver column names.
        df_transformed = deduplicate_by_pk(df_transformed, pk_cols)

        # 7. Remember whether the Silver table is new, for the report.
        is_new_table = not spark.catalog.tableExists(f"silver.{silver_table}")

        # 8. Compute the watermark and row count BEFORE the merge, in a single
        #    pass. df_transformed is lazy and is re-evaluated by every action;
        #    measuring after the merge could yield a watermark newer than the
        #    data actually merged, and later runs would then skip those rows.
        run_stats = df_transformed.agg(
            F.max("_ingestion_timestamp").alias("max_ts"),
            F.count(F.lit(1)).alias("row_count"),
        ).collect()[0]
        max_timestamp = run_stats["max_ts"]
        records_count = run_stats["row_count"]

        # 9. MERGE into Silver.
        merge_into_silver(df_transformed, silver_table, pk_cols, partition_by)

        # 10. Persist the watermark.
        update_control_table(silver_table, max_timestamp, records_count, execution_id)

        # 11. Update the run statistics.
        total_tables_processed += 1
        total_records_processed += records_count

        if is_new_table:
            tables_created.append(silver_table)
        else:
            tables_updated.append(silver_table)

        print(f"‚úÖ {silver_table} processado com sucesso!")

    except Exception as e:
        print(f"‚ùå ERRO ao processar {silver_table}")
        print(f"   Erro: {str(e)}")
        tables_failed.append(silver_table)

        traceback.print_exc()

print(f"\n{'='*80}")
print(f"üéâ PROCESSAMENTO CONCLU√çDO")
print(f"{'='*80}")

In [0]:
# MAGIC ## 9. Relatório de Execução

# COMMAND ----------

# Summary of the run, built from the statistics collected by the processing
# loop. The architecture section names the databases this notebook actually
# uses: it reads from bronze.* and registers its tables as silver.*.
print(f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë                  RELAT√ìRIO - SILVER CNPJ                         ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

üìä ESTAT√çSTICAS GERAIS:
   ‚Ä¢ Execution ID: {execution_id}
   ‚Ä¢ Tabelas processadas: {total_tables_processed}
   ‚Ä¢ Registros processados: {total_records_processed:,}
   ‚Ä¢ Protocolo: abfss:// (ADLS Gen2)

‚úÖ TABELAS CRIADAS ({len(tables_created)}):
{chr(10).join(f'   ‚Ä¢ {t}' for t in tables_created) if tables_created else '   (nenhuma)'}

üîÑ TABELAS ATUALIZADAS ({len(tables_updated)}):
{chr(10).join(f'   ‚Ä¢ {t}' for t in tables_updated) if tables_updated else '   (nenhuma)'}

{f'‚ùå TABELAS COM FALHA ({len(tables_failed)}):' if tables_failed else ''}
{chr(10).join(f'   ‚Ä¢ {t}' for t in tables_failed) if tables_failed else ''}

üóÑÔ∏è ARQUITETURA:
   ‚Ä¢ Bronze (leitura): Hive Metastore ‚Üí bronze.*
   ‚Ä¢ Silver (escrita): ADLS Gen2 + Hive ‚Üí silver.*
   ‚Ä¢ Controle: metadata.silver_control_cnpj

üìÇ LOCALIZA√á√ÉO F√çSICA:
   ‚Ä¢ Silver Tables: {SILVER_BASE_PATH}
   ‚Ä¢ Control Table: {CONTROL_TABLE_PATH}

üîó CONEX√ÉO POWER BI:
   ‚Ä¢ Catalog: hive_metastore
   ‚Ä¢ Schema: silver
   ‚Ä¢ Autentica√ß√£o: Configurada no cluster (Service Principal/Managed Identity)

‚è±Ô∏è PR√ìXIMA EXECU√á√ÉO:
   ‚Ä¢ Apenas dados com _ingestion_timestamp > √∫ltimo processamento
   ‚Ä¢ Processamento incremental autom√°tico
""")

In [0]:
%sql
-- Rebuild silver.corr_ncm_cnae as a full copy of bronze.corr_ncm_cnae.
-- CREATE OR REPLACE swaps the table contents in one Delta transaction, so the
-- cell can be re-run safely. The previous DROP TABLE + CREATE TABLE pair left a
-- window with no table, and dropping an external table keeps its files at
-- LOCATION, which the following CTAS then has to write into.
-- NOTE(review): the storage account is hard-coded here while the Python cells
-- read it from a secret scope - confirm both point to the same account.
CREATE OR REPLACE TABLE silver.corr_ncm_cnae
USING DELTA
LOCATION 'abfss://silver@aceleragrupo5sa.dfs.core.windows.net/corr_ncm_cnae'
AS SELECT * FROM bronze.corr_ncm_cnae