In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    coalesce,
    col,
    current_timestamp,
    initcap,
    lit,
    regexp_replace,
    to_date,
    trim,
    when,
    year,
)
from pyspark.sql.types import DateType, DoubleType, IntegerType, StringType

In [0]:
# Load every column of the bronze results table for ad-hoc inspection.
df_resultado = spark.sql(
    "select * from nttdataeducacao.bronze.resultados_2024"
)

In [0]:
# Preview at most 5 rows of the bronze DataFrame to eyeball its columns and values.
display(df_resultado.limit(5))

In [0]:
# Bronze -> Silver, step 1: project the raw results columns into typed,
# English-named columns. The order of the expressions below is the column
# order of the resulting DataFrame.
df_bronze = spark.table("nttdataeducacao.bronze.resultados_2024")

df_silver_step1 = df_bronze.select(
    # --- Identifiers ---
    col("NU_SEQUENCIAL").cast("long").alias("candidate_id"),
    col("NU_ANO").cast("int").alias("year"),
    # --- School attributes ---
    # -1 is the sentinel for a missing school code. Coalescing AFTER the cast
    # covers both a NULL source value and a non-null value that the cast turns
    # into NULL (e.g. a blank or non-numeric string when ANSI mode is off);
    # checking isNull() on the raw column would miss the second case.
    coalesce(col("CO_ESCOLA").cast("long"), lit(-1).cast("long")).alias("school_id"),
    trim(col("NO_MUNICIPIO_ESC")).alias("school_city"),
    # NOTE(review): state codes are passed through untrimmed, unlike the city
    # names — confirm the bronze values carry no padding.
    col("SG_UF_ESC").alias("school_state"),
    col("TP_DEPENDENCIA_ADM_ESC").cast("int").alias("school_type_id"),
    col("TP_LOCALIZACAO_ESC").cast("int").alias("school_location_id"),
    # --- Exam location ---
    # Kept as "_raw": only whitespace-trimmed here; casing is normalized later.
    trim(col("NO_MUNICIPIO_PROVA")).alias("exam_city_raw"),
    col("SG_UF_PROVA").alias("exam_state"),
    # --- Scores per exam area (double) ---
    col("NU_NOTA_CN").cast("double").alias("score_nature"),
    col("NU_NOTA_CH").cast("double").alias("score_humanities"),
    col("NU_NOTA_LC").cast("double").alias("score_languages"),
    col("NU_NOTA_MT").cast("double").alias("score_math"),
    col("NU_NOTA_REDACAO").cast("double").alias("score_essay"),
    # --- Essay component scores and status (int) ---
    col("NU_NOTA_COMP1").cast("int").alias("essay_comp1"),
    col("NU_NOTA_COMP2").cast("int").alias("essay_comp2"),
    col("NU_NOTA_COMP3").cast("int").alias("essay_comp3"),
    col("NU_NOTA_COMP4").cast("int").alias("essay_comp4"),
    col("NU_NOTA_COMP5").cast("int").alias("essay_comp5"),
    col("TP_STATUS_REDACAO").cast("int").alias("essay_status_id"),
    # --- Presence codes per exam area (int) ---
    col("TP_PRESENCA_CN").cast("int").alias("presence_nature"),
    col("TP_PRESENCA_CH").cast("int").alias("presence_humanities"),
    col("TP_PRESENCA_LC").cast("int").alias("presence_languages"),
    col("TP_PRESENCA_MT").cast("int").alias("presence_math"),
    col("TP_LINGUA").cast("int").alias("foreign_language_choice"),
    # --- Lineage / audit columns ---
    col("source_file").alias("origin_file"),
    col("ingestion_at").alias("bronze_ingested_at"),
    current_timestamp().alias("silver_processed_at"),
)

# Preview at most 5 rows of the typed projection.
display(df_silver_step1.limit(5))

In [0]:
# Silver, step 2: derive presentation and aggregate columns on top of step 1.
# (All pyspark.sql.functions names used here come from the notebook's first
# import cell; no per-cell re-import is needed.)


def _score_or_zero(column_name):
    """Return the named score column with NULL replaced by 0.0."""
    return coalesce(col(column_name), lit(0.0))


# Title-case the trimmed exam city; "exam_city_raw" is kept alongside it.
df_silver_step2 = df_silver_step1.withColumn("exam_city", initcap(col("exam_city_raw")))

df_final = df_silver_step2.withColumn(
    # Sum of the five scores, treating a missing score as 0 so that one NULL
    # does not turn the whole total into NULL.
    "score_total",
    _score_or_zero("score_essay")
    + _score_or_zero("score_math")
    + _score_or_zero("score_languages")
    + _score_or_zero("score_humanities")
    + _score_or_zero("score_nature"),
).withColumn(
    # True only when all four presence codes equal 1. eqNullSafe makes a NULL
    # presence code compare as False; with plain `==` the flag itself could
    # come out NULL under SQL three-valued logic.
    "is_present_all_exams",
    col("presence_nature").eqNullSafe(1)
    & col("presence_humanities").eqNullSafe(1)
    & col("presence_languages").eqNullSafe(1)
    & col("presence_math").eqNullSafe(1),
)

# Preview at most 10 rows of the derived columns.
display(df_final.select("candidate_id", "exam_city", "score_total", "is_present_all_exams").limit(10))

In [0]:
# Persist the final DataFrame as a Delta table partitioned by exam state.
# mode("overwrite") replaces the existing table contents, and overwriteSchema
# allows the stored schema to be replaced as well.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("exam_state")
    .saveAsTable("nttdataeducacao.silver.resultados_2024")
)

print("Tabela Silver salva com sucesso.")
# Preview at most 10 rows of the DataFrame that was just written.
display(df_final.limit(10))