In [0]:
# Read the bronze ENADE 2024 table; the cells below transform this DataFrame.
tabela_bronze = "nttdataeducacao.bronze.enade_2024"
df_silver = spark.table(tabela_bronze)

In [0]:
import re
import unicodedata

def normalize_col(name):
    """Convert a raw column name into a lowercase ASCII snake_case identifier.

    Accented letters are reduced to their base letter (e.g. "Código" ->
    "codigo") instead of being replaced by an underscore. Every remaining run
    of characters outside ``[a-z0-9_]`` becomes a single underscore.

    Args:
        name: Original column name.

    Returns:
        The normalized name, with consecutive underscores collapsed and
        leading/trailing underscores removed. The result is an empty string
        when ``name`` contains no ASCII letters or digits.
    """
    name = name.lower()
    # Canonical decomposition (NFD) splits "ó" into "o" plus a combining accent;
    # dropping the combining marks keeps the base letter. NFD is used rather
    # than NFKD so that symbols such as "º" are NOT expanded into letters and
    # are still treated as separators by the regex below.
    name = ''.join(
        ch for ch in unicodedata.normalize('NFD', name)
        if not unicodedata.combining(ch)
    )
    name = re.sub(r'[^a-z0-9_]+', '_', name)
    name = re.sub(r'_+', '_', name)
    return name.strip('_')

# Rename every column, in order, to its normalized form.
nomes_normalizados = list(map(normalize_col, df_silver.columns))
df_silver = df_silver.toDF(*nomes_normalizados)

In [0]:
from pyspark.sql import functions as F

# Columns that are later cast to double.
# NOTE(review): presumably stored as text with a decimal comma in bronze — confirm.
colunas_numericas_str = [
    "nota_bruta_fg",
    "nota_padronizada_fg",
    "nota_bruta_ce",
    "nota_padronizada_ce",
    "conceito_enade_continuo",
]

# Replace every "," with "." in the values of those columns.
for nome_coluna in colunas_numericas_str:
    valor_com_ponto = F.regexp_replace(F.col(nome_coluna), ',', '.')
    df_silver = df_silver.withColumn(nome_coluna, valor_com_ponto)


In [0]:
# Cast each listed column to double, replacing the column in place.
for nome_coluna in colunas_numericas_str:
    como_double = F.col(nome_coluna).cast("double")
    df_silver = df_silver.withColumn(nome_coluna, como_double)

In [0]:
from pyspark.sql import functions as F


# Remove leading whitespace from string columns only.
# Applying ltrim to a non-string column makes Spark cast it to string, which
# would silently discard numeric/timestamp types set earlier in the notebook.
colunas_texto = {nome for nome, tipo in df_silver.dtypes if tipo == "string"}

# A single select keeps the column order and avoids stacking one projection
# per column, which is what a withColumn loop over every column produces.
df_silver = df_silver.select(
    *[
        F.ltrim(F.col(nome)).alias(nome) if nome in colunas_texto else F.col(nome)
        for nome in df_silver.columns
    ]
)

In [0]:
from pyspark.sql import functions as F

# Lower-case the values of string columns only.
# Applying lower to a non-string column makes Spark cast it to string, which
# would silently discard numeric/timestamp types set earlier in the notebook.
colunas_texto = {nome for nome, tipo in df_silver.dtypes if tipo == "string"}

# A single select keeps the column order and avoids stacking one projection
# per column, which is what a withColumn loop over every column produces.
df_silver = df_silver.select(
    *[
        F.lower(F.col(nome)).alias(nome) if nome in colunas_texto else F.col(nome)
        for nome in df_silver.columns
    ]
)

In [0]:
# Sort the rows by the area_de_avaliacao column (default ascending order).
coluna_ordenacao = "area_de_avaliacao"
df_ordenado = df_silver.orderBy(coluna_ordenacao)

In [0]:
from pyspark.sql import functions as F

# Keep only the first 30 characters of the values in the first column.
primeira = df_ordenado.columns[0]
valor_truncado = F.substring(F.col(primeira), 1, 30)
df_ordenado = df_ordenado.withColumn(primeira, valor_truncado)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Number the rows with row_number() ordered by monotonically_increasing_id(),
# split off rows 1-3, and union the remaining rows with those three (in that
# order) before discarding the helper column.
# NOTE(review): the window has no partitionBy, so Spark is expected to evaluate
# it on a single partition — confirm this is acceptable for the table size.
# NOTE(review): row order of the union is not enforced by any later sort —
# confirm downstream consumers do not rely on it.
w = Window.orderBy(F.monotonically_increasing_id())
df_idx = df_ordenado.withColumn("rn", F.row_number().over(w))

posicao = F.col("rn")
top3 = df_idx.filter(posicao <= 3)
resto = df_idx.filter(posicao > 3)

df_final = resto.unionByName(top3).drop("rn")

In [0]:
from pyspark.sql import functions as F

# Columns whose values are rounded to two decimal places.
colunas_numericas = ["nota_bruta_fg", "nota_padronizada_fg", "nota_bruta_ce", "nota_padronizada_ce", "conceito_enade_continuo"]

for nome_coluna in colunas_numericas:
    arredondado = F.round(F.col(nome_coluna), 2)
    df_final = df_final.withColumn(nome_coluna, arredondado)

In [0]:
from pyspark.sql import functions as F

# Columns converted to integers.
cols_int = ["ano", "codigo_da_area", "codigo_da_ies", "codigo_do_curso", "codigo_do_municipio", "n_de_concluintes_inscritos", "n_de_concluintes_participantes", "conceito_enade_faixa"]

# For each column: first replace "," with "." in the value, then apply
# try_cast(... as int) to the replaced value.
# NOTE(review): try_cast is expected to yield NULL for values that are not
# valid integers (possibly including "3.0"-style text) — confirm on real data.
df_tmp = df_final
for nome_coluna in cols_int:
    sem_virgula = F.regexp_replace(F.col(nome_coluna), ",", ".")
    como_inteiro = F.expr(f"try_cast({nome_coluna} as int)")
    df_tmp = (
        df_tmp
        .withColumn(nome_coluna, sem_virgula)
        .withColumn(nome_coluna, como_inteiro)
    )

df_final = df_tmp

In [0]:
from pyspark.sql import functions as F

# Convert ingestion_at to a timestamp (no explicit format is given).
ingestao_ts = F.to_timestamp("ingestion_at")
df_final = df_final.withColumn("ingestion_at", ingestao_ts)


In [0]:
# Normalize sigla_da_uf: strip surrounding whitespace, then upper-case.
uf_sem_espacos = F.trim(F.col("sigla_da_uf"))
df_final = df_final.withColumn("sigla_da_uf", F.upper(uf_sem_espacos))


In [0]:
from pyspark.sql import functions as F

# Reduce source_file to a lower-cased base name: keep only the text after the
# last "/", then remove the final ".extension" suffix.
nome_arquivo = F.regexp_extract(F.col("source_file"), r"([^/]+)$", 1)
sem_extensao = F.regexp_replace(nome_arquivo, r"\.[^.]+$", "")
df_final = df_final.withColumn("source_file", F.lower(sem_extensao))


In [0]:
# Save the result as the silver Delta table in overwrite mode, then render the DataFrame.
escritor = df_final.write.format("delta").mode("overwrite")
escritor.saveAsTable("nttdataeducacao.silver.enade_2024")
display(df_final)
