In [0]:
%sql
-- Remove the Silver table registration (if any) before the notebook rebuilds it.
DROP TABLE IF EXISTS uc_athlete_data.silver.strava_sub_activity;


In [0]:
# Manual storage cleanup, kept commented out. Both calls delete recursively;
# the second one targets the root of the silver container, so enable with care.
#dbutils.fs.rm("abfss://silver@adlsathlete.dfs.core.windows.net/strava/activities/strava_activities/", recurse=True)
#dbutils.fs.rm("abfss://silver@adlsathlete.dfs.core.windows.net/", recurse=True)

In [0]:
# 📥 1. Read the Bronze table, keeping only rows that have an athlete_id
df_bronze = (
    spark.read.table("uc_athlete_data.bronze.strava_sub_activity")
    .where("athlete_id IS NOT NULL")
)

In [0]:
# Collect the Bronze rows whose athlete_id is NULL
df_errors = spark.table("uc_athlete_data.bronze.strava_sub_activity").where("athlete_id IS NULL")

# Persist them in a separate errors table. The write mode is "append", so rows
# written by earlier runs are kept and each run adds to them.
(
    df_errors.write
    .format("delta")
    .mode("append")
    .saveAsTable("uc_athlete_data.bronze.strava_sub_activity_erros")
)


In [0]:
# Preview the Bronze rows that passed the athlete_id filter.
display(df_bronze)

In [0]:
%sql
-- Inspect the rows rejected for having a NULL athlete_id.
-- The table name must match the one the error rows are written to earlier in
-- this notebook (strava_sub_activity_erros); the previous spelling
-- (..._errors) pointed at a table this notebook never creates.
SELECT *
FROM uc_athlete_data.bronze.strava_sub_activity_erros

In [0]:
%sql
-- Compare total row count with the number of distinct ids in Bronze;
-- a difference means the same id appears on more than one row.
SELECT
  COUNT(*),
  COUNT(DISTINCT id)
FROM uc_athlete_data.bronze.strava_sub_activity

In [0]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Rows sharing (athlete_id, id, start_date) form one group; within a group the
# row with the most recent ingestion_timestamp is ranked first.
window_spec = (
    Window
    .partitionBy("athlete_id", "id", "start_date")
    .orderBy(col("ingestion_timestamp").desc())
)

# Number the rows inside each group
df_ranked = df_bronze.withColumn("row_num", row_number().over(window_spec))

# Keep only the first-ranked row of each group and discard the helper column
df_deduplicated = df_ranked.where(col("row_num") == 1).drop("row_num")


In [0]:
# Preview the de-duplicated rows. display() renders the DataFrame and returns
# None, so nothing may be chained onto the call (the former `.top` raised
# AttributeError).
display(df_deduplicated)

In [0]:
# 🧼 2. Transformations and cleanup
from pyspark.sql.functions import (
    col,
    date_format,
    expr,
    floor,
    format_string,
    from_unixtime,
    input_file_name,
    regexp_extract,
    round,
    to_date,
    to_timestamp,
    when,
)

# Parse start_date using the pattern below (the trailing 'Z' is matched as a
# literal character) and then reduce the timestamp to a date.
df_silver = df_deduplicated.withColumn(
    "start_date",
    to_date(to_timestamp("start_date", "yyyy-MM-dd'T'HH:mm:ss'Z'")),
)

display(df_silver)

# 📤 3. Write to Silver as Delta, replacing existing data and schema at the path
output_path = "abfss://silver@adlsathlete.dfs.core.windows.net/strava/sub_activity/"

(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(output_path)
)


In [0]:
%sql
-- Drop the table registration so the CREATE TABLE that follows picks up the
-- freshly written Delta files. IF EXISTS is required: on a top-to-bottom run
-- the table was already dropped at the start of the notebook and the Delta
-- write above only saves to a path, so an unguarded DROP fails here.
DROP TABLE IF EXISTS uc_athlete_data.silver.strava_sub_activity

In [0]:
%sql
CREATE TABLE IF NOT EXISTS uc_athlete_data.silver.strava_sub_activity
USING DELTA
LOCATION 'abfss://silver@adlsathlete.dfs.core.windows.net/strava/sub_activity/';


In [0]:
%sql
-- Browse the Silver table with id and elapsed_time repeated as leading columns.
SELECT
  id,
  elapsed_time,
  *
FROM uc_athlete_data.silver.strava_sub_activity
-- WHERE start_date = '2025-01-03'

In [0]:
from pyspark.sql.functions import col, to_date, row_number
from pyspark.sql.window import Window

# Read the Silver sub-activity table explicitly instead of going through
# `_sqldf`. `_sqldf` holds the result of whichever SQL cell ran last, so the
# cell's input depended on execution order; the SQL cell just before this one
# also selects id and elapsed_time twice, which makes references to those
# columns ambiguous.
df = (
    spark.table("uc_athlete_data.silver.strava_sub_activity")
    # Reduce the local start timestamp to a calendar date
    .withColumn("activity_date", to_date(col("start_date_local")))
)

# One group per (activity_date, sport_type). Inside a group, rows are ordered
# by calories, then achievement_count, then elapsed_time, all descending.
window_spec = Window.partitionBy("activity_date", "sport_type").orderBy(
    col("calories").desc(),
    col("achievement_count").desc(),
    col("elapsed_time").desc(),
)

# Add the ranking column
df_ranked = df.withColumn("rank", row_number().over(window_spec))

# Keep only the top-ranked activity per day and sport type, then expose the
# result to SQL cells as `temp_view`
df_one_per_day = df_ranked.filter(col("rank") == 1).drop("rank")
df_one_per_day.createOrReplaceTempView("temp_view")


In [0]:
%sql
-- Materialise the per-day/per-sport winners that burned calories.
-- CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE fails
-- with "table already exists" on every run after the first.
CREATE OR REPLACE TABLE uc_athlete_data.silver.sub_activity_final AS
SELECT sport_type, id, calories
FROM temp_view
-- WHERE start_date = '2025-01-03'
WHERE calories > 0
ORDER BY 1

In [0]:
%sql
-- Look up two specific activities in the Silver activities table.
-- ORDER BY 1 sorts by the first selected column (the leading id).
SELECT
  id,
  *
FROM uc_athlete_data.silver.strava_activities
WHERE id IN (13271032297, 13271032301)
ORDER BY 1

In [0]:
%sql
-- Compare the number of non-null ids with the number of distinct ids in Silver.
SELECT
  COUNT(id),
  COUNT(DISTINCT id)
FROM uc_athlete_data.silver.strava_sub_activity

In [0]:
%sql
-- Temp view restricted to activities with start_date on or after 2024-12-01.
CREATE OR REPLACE TEMP VIEW vw_strava_activities AS
SELECT *
FROM uc_athlete_data.silver.strava_activities
WHERE start_date >= '2024-12-01'

In [0]:
%sql
-- List the activity ids from the view that have no row with the same id in
-- the Silver sub-activity table.
SELECT DISTINCT a.id
FROM vw_strava_activities a
LEFT ANTI JOIN uc_athlete_data.silver.strava_sub_activity b
  ON a.id = b.id