In [0]:
from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable
from py_functions_silver import *
from pyspark.sql.functions import current_timestamp
from dataquality_rules_strava import *


# Source table read by the stream, and where the stream keeps its checkpoint.
BRONZE = "uc_athlete_data.bronze.strava_sub_activity"
CHECKPOINT = "abfss://silver@adlsathlete.dfs.core.windows.net/strava/sub_activity/sub_activity_checkpoint/"

# Business key column(s). Switch to another composite if needed,
# e.g. ["activity_id", "lap_index"].
BUSINESS_KEYS = [
    "athlete_id",
    "id",
]

# Ordering column(s) used to pick the "winner" among duplicates inside one
# micro-batch. Add "updated_at" here if that column exists.
ORDER_COLS = [
    "ingestion_timestamp",
]

# Data-quality configuration: the rule set and the table used for rejects.
config = get_rules_subactivity()
rules, reject_table = config["rules"], config["reject_table"]

In [0]:
# --- Compose: apply every silver-layer calculation ----------------------------

def apply_all_silver_calcs(df: DataFrame, *, start_date_col: str = "start_date") -> DataFrame:
    """
    Run the silver-layer column calculations on ``df``.

    At the moment this is a single step: ``add_start_date`` is applied through
    ``DataFrame.transform`` with ``src_col=start_date_col`` and
    ``out_col="start_date"``.

    Args:
        df: Input DataFrame.
        start_date_col: Keyword-only. Name of the column passed to
            ``add_start_date`` as its source column.

    Returns:
        A new DataFrame as produced by ``add_start_date``. The original notes
        describe the resulting ``start_date`` column as a date -- TODO confirm
        against ``add_start_date``.
    """
    def _with_start_date(frame):
        # Output column name is fixed; only the source column is configurable.
        return add_start_date(frame, src_col=start_date_col, out_col="start_date")

    return df.transform(_with_start_date)

In [0]:
def upsert_data(microBatchDF, batch):
    """
    Merge one micro-batch into ``uc_athlete_data.silver.strava_sub_activity``.

    The batch is registered as the temp view ``sub_activities_microbatch`` and
    a ``MERGE`` is run that matches on ``athlete_id``, ``ID`` and
    ``START_DATE``. Only the ``WHEN NOT MATCHED THEN INSERT *`` branch exists,
    so rows that already match are left untouched.

    Args:
        microBatchDF: DataFrame holding the rows of the current micro-batch.
        batch: Batch identifier supplied by the caller; not used here.

    Returns:
        None.
    """
    # NOTE(review): despite the name, there is no WHEN MATCHED branch, so this
    # is insert-if-absent rather than a true upsert -- confirm that is intended.
    # NOTE(review): the match condition includes START_DATE, which is not part
    # of BUSINESS_KEYS -- confirm the two are meant to differ.
    merge_statement = """
                MERGE INTO uc_athlete_data.silver.strava_sub_activity A
                USING sub_activities_microbatch  B
                ON A.athlete_id = b.athlete_id
                   AND A.ID = B.ID
                   AND A.START_DATE = B.START_DATE
                WHEN NOT MATCHED THEN INSERT * 
                """

    # Make the batch addressable from SQL under the name the MERGE reads from.
    microBatchDF.createOrReplaceTempView("sub_activities_microbatch")

    # Issue the statement through the batch's own session, the same object the
    # temp view was registered from.
    session = microBatchDF.sparkSession
    session.sql(merge_statement)

In [0]:
def load_data(microBatchDF, batch):
    """
    Process one micro-batch end to end; used as the ``foreachBatch`` callback.

    Stages, in order, each followed by a progress print:
      1. ``dedupe_microbatch`` with ``BUSINESS_KEYS`` and ``ORDER_COLS``.
      2. ``apply_all_silver_calcs`` with its default arguments.
      3. ``assert_quality`` with ``rules`` and ``reject_table``; its return
         value is what gets merged.
      4. ``upsert_data`` with the resulting DataFrame and ``batch``.

    Args:
        microBatchDF: DataFrame holding the rows of the current micro-batch.
        batch: Batch identifier, forwarded unchanged to ``upsert_data``.

    Returns:
        None.
    """
    deduped = dedupe_microbatch(microBatchDF, BUSINESS_KEYS, ORDER_COLS)
    print("dedup ok ")

    enriched = apply_all_silver_calcs(deduped)
    print("calcs ok ")

    # presumably rows failing the rules are routed to reject_table by
    # assert_quality -- verify against its implementation.
    df_clean = assert_quality(enriched, rules, reject_table)
    print("clean ok ")

    upsert_data(df_clean, batch)
    print("upsert ok ")

In [0]:
# Open the bronze table as a streaming source.
bronze_stream = spark.readStream.table(BRONZE)

# Hand every micro-batch to load_data, tracking progress in CHECKPOINT.
# The availableNow trigger is combined with awaitTermination below so the cell
# blocks until the started query terminates.
query = (
    bronze_stream.writeStream
    .trigger(availableNow=True)
    .option("checkpointLocation", CHECKPOINT)
    .foreachBatch(load_data)
    .start()
)

query.awaitTermination()

In [0]:
%sql
-- Rejected-row counts per reject reason and calendar day, newest day first.
SELECT
  count(*)               AS total,
  reject_reason,
  date(reject_timestamp) AS dt
FROM uc_athlete_data.silver_rejects.strava_sub_activity
GROUP BY
  reject_reason,
  date(reject_timestamp)
ORDER BY
  dt DESC;


In [0]:
%sql
-- Inspect the silver table, most recently ingested rows first.
SELECT *
FROM uc_athlete_data.silver.strava_sub_activity
ORDER BY ingestion_timestamp DESC

In [0]:
%sql
-- Ad-hoc probe: bronze rows for the hard-coded id 200002, newest ingestion first.
SELECT *
FROM uc_athlete_data.bronze.strava_sub_activity
WHERE id = 200002
ORDER BY ingestion_timestamp DESC

In [0]:
%sql
-- Inspect the rejects table, most recently rejected rows first.
SELECT *
FROM uc_athlete_data.silver_rejects.strava_sub_activity
ORDER BY reject_timestamp DESC
