In [0]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
 
# Reuse the active Spark session if one exists, otherwise start one under
# this notebook's application name.
spark = (
    SparkSession.builder
    .appName("IngestaoDimBalance")
    .getOrCreate()
)

In [0]:
%run ./Intancia_Containers

In [0]:
from pyspark.sql.functions import max as spark_max, col

# Latest ingestion timestamp recorded in the control table for each table of
# the BALANCE originator.
df_last_ingestion = (
    spark.read.format("delta")
        .load(CONTROL_TABLE_PATH)
        .filter(col("originator") == "BALANCE")
        .groupBy("table_name")
        .agg(
            spark_max("last_ingestion_timestamp")
                .alias("last_ingestion_timestamp")
        )
)

# The ingestion loop in this notebook stores `table_name` in the control table
# as the lower-cased file name up to the first dot, while the inventory's
# `table_name` is the file name including its extension. Joining on the raw
# values never matches, so every file would look "never ingested" and be
# appended again on every run. Derive the control-table key on the inventory
# side (same rule as Python's `name.split('.')[0].lower()`) and join on that.
control_key = F.lower(F.substring_index(col("table_name"), ".", 1))

# Inventory files of the BALANCE container that still need to be ingested.
df_inventario_filtrado = (
    df_inventario
        .filter(col("container_name") == "BALANCE")
        # Drop the fact files (EXP_ / IMP_ prefixes); per the original note
        # these are handled by Auto Loader instead.
        .filter(
            ~(
                col("table_name").startswith("EXP_") |
                col("table_name").startswith("IMP_")
            )
        )
        .withColumn("_control_key", control_key)
        .join(
            # Rename so the only `table_name` left after the join is the
            # inventory's original file name, which the loop below reads.
            df_last_ingestion.withColumnRenamed("table_name", "_control_key"),
            on="_control_key",
            how="left"
        )
        # Keep files never ingested, or modified after their last ingestion.
        .filter(
            col("last_ingestion_timestamp").isNull() |
            (col("modificationTime") > col("last_ingestion_timestamp"))
        )
        .drop("last_ingestion_timestamp", "_control_key")
)

display(df_inventario_filtrado)

In [0]:
from pyspark.sql.functions import current_timestamp, current_date, lit
from delta.tables import DeltaTable
import re

# Ingest every pending file of the BALANCE container into its bronze Delta
# table and record the ingestion in the control table.
#
# Files are processed independently: an error in one file is printed and
# recorded, the loop moves on to the next file, and once every file has been
# attempted a single RuntimeError is raised if anything failed, so the run is
# not reported as successful with data missing.

# Handle to the control table. It does not depend on the file being processed,
# so resolve it once, and before any data is appended.
control_table = DeltaTable.forPath(spark, CONTROL_TABLE_PATH)

# (file name, error message) for every file that raised during processing
falhas = []

for row in df_inventario_filtrado.collect():
    path_origem = row['path']
    container = row['container_name']
    nome_arquivo = row['table_name']

    # Table name is the file name up to the first dot; its lower-cased form is
    # used both as the bronze folder name and as the control-table key.
    tabela_nome = nome_arquivo.split('.')[0]
    print(f"⏳ Processando: {tabela_nome}")

    tabela_destino = tabela_nome.lower()

    # Destination path of the bronze Delta table in the lakehouse
    destino_tabela = f"{BRONZE_BASE_PATH}/balancacomercial/{tabela_destino}"

    # Only CSV files are ingested here; anything else is reported and skipped.
    if not nome_arquivo.endswith('.csv'):
        print(f"❌ Arquivo {nome_arquivo} no {container} BALANCE não é CSV")
        continue

    try:
        # Read the raw file: header row, ';' separator, ISO-8859-1 encoding
        df_raw = (
            spark.read.format("csv")
            .option("header", "true")
            .option("sep", ";")
            .option("encoding", "ISO-8859-1")
            .load(path_origem)
        )

        # Cast every column to string
        df_str = df_raw.select(
            [F.col(c).cast("string").alias(c) for c in df_raw.columns]
        )

        # Add ingestion metadata columns (date, timestamp, source path)
        df_bronze = (
            df_str
            .withColumn("_ingestion_date", F.current_date())
            .withColumn("_ingestion_timestamp", F.current_timestamp())
            .withColumn("_source_path", F.lit(path_origem))
        )

        # NOTE(review): the log text says FACT, but the upstream filter
        # excludes the EXP_/IMP_ fact files — confirm the intended wording.
        print(f"Gravando FACT {tabela_nome} (append | partitionBy Ingestion_Date)")

        (
            df_bronze.write
            .format("delta")
            .mode("append")
            .partitionBy("_ingestion_date")
            .option("mergeSchema", "true")
            .save(destino_tabela)
        )

        # Upsert this table's row in the control table, keyed by
        # (originator, table_name), stamping the current time.
        controle_df = spark.createDataFrame(
            [("BALANCE", tabela_destino, path_origem)],
            ["originator", "table_name", "input_file_name"]
        ).withColumn(
            "last_ingestion_timestamp", F.current_timestamp()
        )

        (
            control_table.alias("target")
            .merge(
                controle_df.alias("source"),
                "target.originator = source.originator AND target.table_name = source.table_name"
            )
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )

    except Exception as e:
        # Keep going so one bad file does not block the others, but remember
        # the failure so it can be surfaced after the loop.
        print(f"❌ Erro ao processar {nome_arquivo}: {e}")
        falhas.append((nome_arquivo, str(e)))

# Surface the failures only after every file had its chance to be ingested.
if falhas:
    resumo = "; ".join(f"{nome}: {erro}" for nome, erro in falhas)
    raise RuntimeError(
        f"{len(falhas)} arquivo(s) BALANCE falharam na ingestão: {resumo}"
    )