In [20]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, upper , to_date
from pyspark.sql.types import DateType
from dotenv import load_dotenv

# Carregar variáveis de ambiente do caminho dentro do container
load_dotenv('../.env_kafka_connect')

# Config AWS e Postgres (variáveis de ambiente)
aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = "us-east-1"

pg_host = os.getenv("POSTGRES_HOST")
pg_port = os.getenv("POSTGRES_PORT", "5432")
pg_db = os.getenv("POSTGRES_DB")
pg_user = os.getenv("POSTGRES_USER")
pg_password = os.getenv("POSTGRES_PASSWORD")

# JDBC URL e propriedades de conexão
#jdbc_url = f"jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}"
jdbc_url = f"jdbc:postgresql://host.docker.internal:5432/postgres"
jdbc_properties = {
    "user": pg_user,
    "password": pg_password,
    "driver": "org.postgresql.Driver"
}

print(jdbc_url)
print(jdbc_properties)


jdbc:postgresql://host.docker.internal:5432/postgres
{'user': 'postgres', 'password': 'postgres', 'driver': 'org.postgresql.Driver'}


In [21]:
# Caminho dos jars para Spark (Postgres + AWS)
hadoop_aws_jar = "../jars/hadoop-aws-3.3.4.jar"
aws_sdk_jar = "../jars/aws-java-sdk-bundle-1.12.262.jar"
postgres_jdbc_jar = "../jars/postgresql-42.6.2.jar"
jars_path = f"{hadoop_aws_jar},{aws_sdk_jar},{postgres_jdbc_jar}"

print(f"jars_path: {jars_path}")

# Criar SparkSession com configurações AWS + JDBC
spark = SparkSession.builder \
    .appName("Pipeline - Silver") \
    .config("spark.jars", jars_path) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key) \
    .config("spark.hadoop.fs.s3a.endpoint", f"s3.{aws_region}.amazonaws.com") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "true") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .getOrCreate()

# Colunas obrigatórias para limpeza
colunas_obrigatorias = ["CompraManha", "VendaManha", "PUCompraManha", "PUVendaManha", "PUBaseManha", "Data_Vencimento", "Data_Base", "Tipo"]

jars_path: ../jars/hadoop-aws-3.3.4.jar,../jars/aws-java-sdk-bundle-1.12.262.jar,../jars/postgresql-42.6.2.jar


In [22]:
# Ler dados Bronze do Postgres
df_ipca = spark.read.jdbc(url=jdbc_url, table="public.dadostesouroipca", properties=jdbc_properties)
df_pre = spark.read.jdbc(url=jdbc_url, table="public.dadostesouropre", properties=jdbc_properties)

# Drop duplicatas
df_ipca = df_ipca.dropDuplicates()
df_pre = df_pre.dropDuplicates()

#Criando coluna 'Data_Base' a partir de 'dt_update'
df_ipca = df_ipca \
    .withColumn("Data_Base", to_date(col("dt_update"))) \
    .withColumn("Data_Vencimento", to_date(col("dt_update")))
df_pre = df_pre \
    .withColumn("Data_Base", to_date(col("dt_update"))) \
    .withColumn("Data_Vencimento", to_date(col("dt_update")))

# Drop nulos nas colunas importantes
df_ipca = df_ipca.dropna(subset=colunas_obrigatorias)
df_pre = df_pre.dropna(subset=colunas_obrigatorias)

# Ajuste de tipos para colunas de data
df_ipca = df_ipca.withColumn("Data_Base", col("Data_Base").cast("date"))
df_ipca = df_ipca.withColumn("Data_Vencimento", col("Data_Vencimento").cast("date"))
df_pre = df_pre.withColumn("Data_Base", col("Data_Base").cast("date"))
df_pre = df_pre.withColumn("Data_Vencimento", col("Data_Vencimento").cast("date"))

# Padronizar coluna Tipo para maiúsculo
df_ipca = df_ipca.withColumn("Tipo", upper(col("Tipo")))
df_pre = df_pre.withColumn("Tipo", upper(col("Tipo")))

# Gravar os dados tratados na camada Silver
df_ipca.write.jdbc(url=jdbc_url, table="public.dadostesouroipca_silver", mode="overwrite", properties=jdbc_properties)
df_pre.write.jdbc(url=jdbc_url, table="public.dadostesouropre_silver", mode="overwrite", properties=jdbc_properties)

print("Pipeline Silver executado com sucesso!")

Pipeline Silver executado com sucesso!
