In [1]:
from pyspark.sql import SparkSession
import logging
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Session bootstrap: configures logging and creates the SparkSession.
# Every later cell relies on the notebook-level names `log` and `spark`, so
# they are defined unconditionally here. The former
# `if __name__ == "__main__":` guard had no purpose in a notebook and would
# have left both names undefined whenever the condition was false.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Spark session running locally on a single worker thread, with Hive support.
# NOTE(review): the app name still says "upload_to_trusted" although this
# notebook reads the trusted layer and writes the delivery layer — confirm
# whether it should be renamed.
spark = (
    SparkSession.builder.master("local[1]")
    .appName("upload_to_trusted")
    .enableHiveSupport()
    # Hadoop committer option; presumably set so that no `_SUCCESS` marker
    # file is written next to the output — TODO confirm.
    .config("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    .getOrCreate()
)

In [3]:
## Reading files
# Loads the three trusted-layer datasets (bancos, glassdoor, reclamacoes) into
# the DataFrames used by the transformation cell below.

# Base folder of the trusted layer, relative to the kernel's working directory.
TRUSTED_DIR = 'code/atividade3/data/trusted'


def read_trusted(dataset):
    """Log the read and load every parquet file in the dataset's trusted folder.

    Args:
        dataset: Name of the sub-folder under ``TRUSTED_DIR``.

    Returns:
        The DataFrame produced by ``spark.read.parquet`` for that folder.
    """
    log.info(f"Reading {dataset} files...")
    return spark.read.parquet(f'{TRUSTED_DIR}/{dataset}/*.parquet')


log.info("Processing started...")

df_bancos = read_trusted("bancos")
df_glassdoor = read_trusted("glassdoor")
df_reclamacoes = read_trusted("reclamacoes")


2025-08-11 21:04:59,557 - INFO - Processing started...
2025-08-11 21:04:59,561 - INFO - Reading bancos files...
2025-08-11 21:05:03,210 - INFO - Reading glassdoor files...
2025-08-11 21:05:03,528 - INFO - Reading reclamacoes files...


In [None]:
## Transforming
# Builds the delivery dataset in two inner joins:
#   1. glassdoor x bancos
#   2. (glassdoor x bancos) x reclamacoes
# A pair of rows matches when either the `cnpj` or the `name` columns are equal.
# After each join the right-hand side's `cnpj`, `name` and `segment` columns are
# dropped so only the left-hand copies remain, and exact duplicate rows are
# removed.

log.info("Transforming trusted files...")

log.info("Joining dataframes...")

df_bancos_glassdoor = (
    df_glassdoor.alias("g")
    .join(
        df_bancos.alias("b"),
        (f.col("b.cnpj") == f.col("g.cnpj")) | (f.col("b.name") == f.col("g.name")),
        "inner",
    )
    # One Column per drop() call: PySpark releases before 3.4 raise TypeError
    # when drop() receives several Column objects at once.
    .drop(f.col("b.cnpj"))
    .drop(f.col("b.name"))
    .drop(f.col("b.segment"))
    .dropDuplicates()
)

df_bancos_glassdoor_reclamacoes = (
    df_bancos_glassdoor.alias("bg")
    .join(
        df_reclamacoes.alias("r"),
        (f.col("bg.cnpj") == f.col("r.cnpj")) | (f.col("bg.name") == f.col("r.name")),
        "inner",
    )
    # Same single-Column drop pattern as above, for the same reason.
    .drop(f.col("r.cnpj"))
    .drop(f.col("r.name"))
    .drop(f.col("r.segment"))
    .dropDuplicates()
)

In [None]:
## Writing output files
# Persists the joined dataset as parquet in the delivery layer.

log.info("Writing delivery file...")

# The trusted inputs are read from 'code/atividade3/data/trusted/...', i.e.
# relative to the kernel's working directory. The previous target
# "../../data/delivery/" was relative to a different base and would have
# resolved two levels above that directory, so the delivery folder is now
# addressed from the same base as the inputs.
# NOTE(review): confirm this is the intended delivery location.
(
    df_bancos_glassdoor_reclamacoes
    .coalesce(1)        # a single partition, so one part file is written
    .write
    .format("parquet")
    .mode("overwrite")  # replaces any existing content at the target path
    .save("code/atividade3/data/delivery/")
)


