In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

In [0]:
# Load the bronze-layer positions table as the input DataFrame for this notebook.
df_posicao = spark.read.table("project_data_football_bronze.posicoes")

In [0]:
# Render the bronze positions DataFrame for a quick visual inspection.
display(df_posicao)

In [0]:
%sql
-- Show the column names and data types of the bronze positions table.
DESCRIBE TABLE project_data_football_bronze.posicoes

In [0]:
# Project the dimension columns: posicao_id is cast to int, nome and abreviacao
# pass through unchanged, and dt_ingestao is exposed as dt_ultima_atualizacao.
df_dim_posicao = (
    df_posicao
        .select(
            col("posicao_id").cast("int"),
            "nome",
            "abreviacao",
            col("dt_ingestao").alias("dt_ultima_atualizacao"),
        )
)

In [0]:
# Remove duplicated positions: within each posicao_id, rank rows from the most
# recent dt_ultima_atualizacao to the oldest and keep only the first one.
#
# row_number() picks an arbitrary row among ties, so ordering by the timestamp
# alone can keep a different record on each run when two rows share the same
# timestamp. The remaining attributes are used as secondary sort keys so the
# surviving row is reproducible. Null ordering is spelled out explicitly:
# rows with a null timestamp / name / abbreviation are ranked last.
window_spec = (
    Window.partitionBy("posicao_id")
          .orderBy(
              col("dt_ultima_atualizacao").desc_nulls_last(),
              col("nome").asc_nulls_last(),
              col("abreviacao").asc_nulls_last(),
          )
)

df_dim_posicao_tratado = (
    df_dim_posicao
        .withColumn("rn", row_number().over(window_spec))
        # rn == 1 is the top-ranked (most recent) row of each posicao_id
        .filter(col("rn") == 1)
        # the helper ranking column is not part of the dimension
        .drop("rn")
)

In [0]:
# Persist the deduplicated dimension as a Delta table in the silver schema,
# replacing both the existing data and the existing schema.
(
    df_dim_posicao_tratado.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("project_data_football_silver.dim_posicao")
)


In [0]:
%sql
-- Inspect the contents of the silver position dimension.
SELECT *
FROM project_data_football_silver.dim_posicao

In [0]:
%sql
-- Duplicate check: lists every posicao_id that appears more than once in the
-- silver dimension together with its row count. An empty result means each
-- posicao_id is unique.
SELECT
  posicao_id,
  COUNT(*)
FROM
  project_data_football_silver.dim_posicao
GROUP BY
  posicao_id
HAVING
  COUNT(*) > 1;