In [10]:
import logging

# Get (or create) the logger named 'jupyter_logger'.
logger = logging.getLogger('jupyter_logger')

# Set the logger level to DEBUG so that every message is let through.
logger.setLevel(logging.DEBUG)

# Formatter that prefixes each message with timestamp, logger name and level.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.getLogger() returns the same object for the same name for the whole
# life of the kernel, so blindly calling addHandler() on every run of this cell
# stacks up handlers and prints each message once per run. Reuse the stream
# handler if one is already attached and only create it the first time.
handler = next(
    (h for h in logger.handlers if type(h) is logging.StreamHandler),
    None,
)
if handler is None:
    # StreamHandler() with no argument writes to sys.stderr.
    handler = logging.StreamHandler()
    logger.addHandler(handler)

# (Re)apply level and format so edits to this cell take effect on re-run.
handler.setLevel(logging.DEBUG)
handler.setFormatter(formatter)

In [11]:
import os

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


# "file:////home/user/to_process/006790cd-5c1b-47b8-9666-045f9dad6846.json"
# Create the Spark session (Delta Lake enabled, S3A pointed at MinIO).
#
# The MinIO credentials are read from the environment instead of being
# hardcoded in the notebook: export MINIO_ACCESS_KEY and MINIO_SECRET_KEY
# before starting the kernel. A missing variable raises KeyError here, before
# any job is submitted. MINIO_ENDPOINT is optional and defaults to the address
# this notebook has always used.
# NOTE(review): the key pair that used to be written in this cell should be
# treated as leaked and rotated.
# NOTE(review): most fs.s3a.* options below are set without the
# "spark.hadoop." prefix, unlike fs.s3a.ssl.enabled -- confirm they reach the
# Hadoop configuration in this deployment before normalising them.

spark = (
    SparkSession.builder
    .appName("Merge with Delta Lake")
    # Fail the job on the first task failure instead of retrying.
    .config("spark.task.maxFailures", "1")
    .config("fs.s3a.endpoint", os.environ.get("MINIO_ENDPOINT", "http://minio:9000"))
    .config("fs.s3a.access.key", os.environ["MINIO_ACCESS_KEY"])
    .config("fs.s3a.secret.key", os.environ["MINIO_SECRET_KEY"])
    .config("fs.s3a.path.style.access", "true")
    .config("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.ssl.enabled", "false")
    # Register the Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

def executing_merge(raw_bucket,trust_bucket, spark=spark):
    """Upsert the JSON records found under ``raw_bucket`` into a Delta table.

    Every file matching ``{raw_bucket}/*`` is read as JSON. The records are
    referenced through a nested ``data`` struct with the fields ``id``,
    ``first_name``, ``last_name``, ``email`` and ``date_of_birth``.

    If ``trust_bucket`` is not yet a Delta table, it is first created from the
    flattened incoming records. In every case a merge keyed on ``id`` is then
    executed: matching rows get their non-key fields updated, and rows without
    a match are inserted.

    Args:
        raw_bucket: Location whose children are read as JSON input.
        trust_bucket: Path of the target Delta table.
        spark: Session used for reading and for the Delta operations;
            defaults to the module-level ``spark``.

    Returns:
        None.
    """
    # Non-key columns copied from the incoming ``data`` struct.
    user_fields = ("first_name", "last_name", "email", "date_of_birth")

    incoming = spark.read.json(f"{raw_bucket}/*")
    target_path = trust_bucket

    if not DeltaTable.isDeltaTable(spark, target_path):
        # No Delta table at the target yet: create it from the flattened input.
        flattened = incoming.select(
            *[col(f"data.{name}").alias(name) for name in ("id",) + user_fields]
        )
        flattened.write.format("delta").save(target_path)
    target_table = DeltaTable.forPath(spark, target_path)

    logger.debug("Executing merge")

    # Target column -> source expression, for the update and the insert clause.
    update_map = {name: f"newData.data.{name}" for name in user_fields}
    insert_map = {"id": "newData.data.id", **update_map}

    (
        target_table.alias("oldData")
        .merge(incoming.alias("newData"), "oldData.id = newData.data.id")
        .whenMatchedUpdate(set=update_map)
        .whenNotMatchedInsert(values=insert_map)
        .execute()
    )
    
#     

In [12]:
# Location of the raw JSON input files.
raw_bucket = "s3a://raw/user"
# Location of the trusted Delta table that receives the merge.
trust_bucket = "s3a://trusted/user"

In [13]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Describe both listings first (no job runs here), then count them in the
# same order as before: raw location first, trusted location second.
files_raw_before_merge = sc.wholeTextFiles(raw_bucket)
files_trusted_before_merge = sc.wholeTextFiles(trust_bucket)

file_count_raw_before_merge = files_raw_before_merge.count()
file_count_trusted_before_merge = files_trusted_before_merge.count()

# Record the number of files seen at each location before the merge.
logger.debug(
    f"Before merge in raw bucket has {file_count_raw_before_merge} and trust bucket has {file_count_trusted_before_merge}"
)


2024-05-20 02:24:56,366 - jupyter_logger - DEBUG - Before merge in raw bucket has 38 and trust bucket has 15
2024-05-20 02:24:56,366 - jupyter_logger - DEBUG - Before merge in raw bucket has 38 and trust bucket has 15


In [14]:
# Merge the raw JSON records into the trusted Delta table.
executing_merge(
    raw_bucket=raw_bucket,
    trust_bucket=trust_bucket,
)

2024-05-20 02:24:57,230 - jupyter_logger - DEBUG - Executing merge
2024-05-20 02:24:57,230 - jupyter_logger - DEBUG - Executing merge


In [15]:
# Count the files at both locations again now that the merge has run. The raw
# location is re-listed rather than echoing the number measured before the
# merge, so the "After merge" message reports the state at the time it is
# logged.
files_raw_after_merge = sc.wholeTextFiles(raw_bucket)
file_count_raw_after_merge = files_raw_after_merge.count()

files_trusted_after_merge = sc.wholeTextFiles(trust_bucket)
file_count_trusted_after_merge = files_trusted_after_merge.count()

logger.debug(f"After merge in raw bucket has {file_count_raw_after_merge} and trust bucket has {file_count_trusted_after_merge}")

2024-05-20 02:24:58,766 - jupyter_logger - DEBUG - After merge in raw bucket has 38 and trust bucket has 16
2024-05-20 02:24:58,766 - jupyter_logger - DEBUG - After merge in raw bucket has 38 and trust bucket has 16
