In [0]:
%sql
-- Enriched activity dataset.
-- Joins silver.strava_activities (a) to silver.sub_activity_final (b) on id,
-- keeps activities whose start_date is on or after 2024-12-01, and adds
-- derived columns: periodo_dia, semana, ano, intensidade_calorica,
-- duracao_tipo and mes_ref.
-- Every column reference is qualified with its table alias so the query keeps
-- resolving unambiguously if either table later gains a same-named column.
select
  a.athlete_id,
  a.sport_type,
  b.calories,
  a.achievement_count,
  a.pr_count,
  a.has_heartrate,
  a.average_heartrate,
  a.max_heartrate,
  a.start_date,
  a.start_date_local,
  a.distance_km,
  a.average_speed_kmh,
  a.average_cadence,
  a.total_elevation_gain,
  a.tempo_real,
  a.pace_strava,
  a.dia_semana,
  -- Part of the day from the local start hour:
  -- hour < 12 -> 'manhã', hour < 18 -> 'tarde', anything else -> 'noite'.
  -- A NULL start_date_local also ends up in the ELSE branch ('noite').
  case
    when hour(to_timestamp(a.start_date_local)) < 12 then 'manhã'
    when hour(to_timestamp(a.start_date_local)) < 18 then 'tarde'
    else 'noite'
  end as periodo_dia,
  -- NOTE(review): weekofyear follows ISO-8601 week numbering while year()
  -- returns the calendar year, so dates around New Year (e.g. 2024-12-30)
  -- can yield semana = 1 together with ano = 2024. Confirm whether consumers
  -- combine semana and ano as a key.
  weekofyear(to_timestamp(a.start_date_local)) as semana,
  year(to_timestamp(a.start_date_local)) as ano,
  -- Calorie bucket: >= 600 'alta', >= 300 'moderada', otherwise 'leve'.
  -- NULL calories fail both comparisons and are therefore labelled 'leve'.
  case
    when b.calories >= 600 then 'alta'
    when b.calories >= 300 then 'moderada'
    else 'leve'
  end as intensidade_calorica,
  -- Duration bucket from the hour and minute parts of tempo_real, expressed
  -- in minutes: >= 60 'longa', >= 30 'média', otherwise 'curta'.
  case
    when (hour(a.tempo_real) * 60 + minute(a.tempo_real)) >= 60 then 'longa'
    when (hour(a.tempo_real) * 60 + minute(a.tempo_real)) >= 30 then 'média'
    else 'curta'
  end as duracao_tipo,
  -- Month label formatted as 'MMM/yyyy'.
  date_format(to_timestamp(a.start_date_local), 'MMM/yyyy') as mes_ref
from uc_athlete_data.silver.strava_activities a
inner join uc_athlete_data.silver.sub_activity_final b
  on a.id = b.id
where a.start_date >= '2024-12-01'

In [0]:
# `_sqldf` is the implicit DataFrame that holds the result of the last %sql
# cell that was run, so any other SQL cell in this notebook (including the
# duplicate check that reads "dados") overwrites it. Verify that it still
# carries the enriched-activity columns before publishing it as the "dados"
# view; otherwise an out-of-order re-run would silently register the wrong data.
required_columns = {"athlete_id", "sport_type", "start_date", "periodo_dia", "mes_ref"}
missing_columns = required_columns - set(_sqldf.columns)
if missing_columns:
    raise ValueError(
        "_sqldf does not hold the enriched activities result "
        f"(missing columns: {sorted(missing_columns)}); "
        "re-run the enrichment %sql cell immediately before this one."
    )

_sqldf.createOrReplaceTempView("dados")

In [0]:
%sql
-- Duplicate check on the "dados" view: lists every
-- (athlete_id, sport_type, start_date) combination that occurs more than once.
-- athlete_id is part of the grouping so that two different athletes starting
-- the same sport at the same instant are not reported as duplicates.
-- The original three output columns keep their positions; athlete_id is
-- appended after them.
select
  count(*),
  sport_type,
  start_date,
  athlete_id
from dados
group by athlete_id, sport_type, start_date
having count(*) > 1


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Deduplicate df_bronze: one row is kept per (athlete_id, id, start_date) key,
# namely the one with the greatest ingestion_timestamp.
# NOTE(review): rows tied on ingestion_timestamp inside a key have no further
# ordering criterion here - confirm such ties cannot occur.
window_spec = (
    Window
    .partitionBy("athlete_id", "id", "start_date")
    .orderBy(col("ingestion_timestamp").desc())
)

# Number the rows inside each key, newest ingestion first.
df_ranked = df_bronze.withColumn(
    "row_num",
    row_number().over(window_spec),
)

# Keep only the first-ranked row of every key, then discard the helper column.
df_deduplicated = (
    df_ranked
    .where(col("row_num") == 1)
    .drop("row_num")
)
