In [0]:
from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable
from py_functions_silver import *
from pyspark.sql.functions import current_timestamp

# Three-part table names and the streaming checkpoint location used by this notebook.
BRONZE = "uc_athlete_data.bronze.strava_activities"
SILVER = "uc_athlete_data.silver.strava_activities"
CHECKPOINT = "abfss://silver@adlsathlete.dfs.core.windows.net/strava/activities/activities_checkpoint/"

# Business key(s) — switch to a composite key if needed, e.g. ["activity_id","lap_index"]
BUSINESS_KEYS = ["id"]
# Ordering columns used to pick the "winner" among duplicates within a micro-batch
ORDER_COLS = ["ingestion_timestamp"]  # add "updated_at" if it exists

In [0]:
%sql
-- Staging table with the same columns as the silver table and zero rows
-- (the WHERE 1 = 0 predicate copies the structure only).
-- IF NOT EXISTS keeps this cell re-runnable: without it a second run fails
-- because the table already exists.
CREATE TABLE IF NOT EXISTS uc_athlete_data.silver.stage_strava_activities
AS
SELECT *
FROM uc_athlete_data.silver.strava_activities
WHERE 1 = 0;


In [0]:
# Streaming read of the table named by BRONZE; consumed by the writeStream cell further down.
bronze_stream = spark.readStream.table(BRONZE)

In [0]:
def _dedupe_microbatch(df):
    """Keep a single row per business key from ``df``.

    Rows are grouped by the columns listed in ``BUSINESS_KEYS`` and numbered
    within each group by the ``ORDER_COLS`` columns in descending order; only
    the row numbered 1 is kept. The temporary ``row_id`` column is dropped
    before the result is returned.
    """
    by_business_key = Window.partitionBy(*[F.col(name) for name in BUSINESS_KEYS])
    ranking = by_business_key.orderBy(*[F.col(name).desc() for name in ORDER_COLS])

    numbered = df.withColumn("row_id", F.row_number().over(ranking))
    winners = numbered.filter(F.col("row_id") == 1)
    return winners.drop("row_id")

In [0]:
# --- Compose (applies every silver calculation in sequence) -------------------

def apply_all_silver_calcs(
    df: DataFrame,
    *,
    type_col: str = "type",
    distance_col: str = "distance",
    average_speed_col: str = "average_speed",
    moving_time_col: str = "moving_time",
    elapsed_time_col: str = "elapsed_time",
    start_date_col: str = "start_date",
    non_run_value_for_pace=0,  # default kept equal to the original snippet
) -> DataFrame:
    """
    Apply all silver-layer transformations to ``df`` in a fixed order.

    Each step is one of the ``add_*`` helpers, applied through
    ``DataFrame.transform``. The keyword-only ``*_col`` arguments name the
    source columns and are forwarded to the helpers;
    ``non_run_value_for_pace`` is forwarded as ``non_run_value`` to the two
    pace helpers that are called with ``only_for_run=True``.

    Returns a new DataFrame with these output columns:
      - start_date (date)
      - distance_km
      - average_speed_kmh
      - pace_min_km
      - pace_min_km_moving_time
      - tempo_real (HH:MM:SS)
      - pace_min_km_new
      - pace_strava (M:SS)
      - dia_semana
    """
    # Order matters: pace_strava reads pace_min_km_new, and dia_semana reads
    # the start_date column produced by the first step.
    steps = (
        lambda d: add_start_date(d, src_col=start_date_col, out_col="start_date"),
        lambda d: add_distance_km(
            d, distance_col=distance_col, out_col="distance_km", decimals=2
        ),
        lambda d: add_average_speed_kmh(
            d, avg_speed_col=average_speed_col, out_col="average_speed_kmh", decimals=2
        ),
        lambda d: add_pace_min_km(
            d,
            elapsed_time_col=elapsed_time_col,
            distance_col=distance_col,
            type_col=type_col,
            out_col="pace_min_km",
            decimals=2,
            only_for_run=True,
            non_run_value=non_run_value_for_pace,
        ),
        lambda d: add_pace_min_km_moving_time(
            d,
            moving_time_col=moving_time_col,
            distance_col=distance_col,
            type_col=type_col,
            out_col="pace_min_km_moving_time",
            decimals=2,
            only_for_run=True,
            non_run_value=non_run_value_for_pace,
        ),
        lambda d: add_tempo_real(d, seconds_col=moving_time_col, out_col="tempo_real"),
        lambda d: add_pace_min_km_new(
            d,
            moving_time_col=moving_time_col,
            distance_col=distance_col,
            out_col="pace_min_km_new",
            decimals=3,
        ),
        lambda d: add_pace_strava(d, pace_min_col="pace_min_km_new", out_col="pace_strava"),
        lambda d: add_dia_semana(
            d, date_col="start_date", out_col="dia_semana", pattern="E"
        ),
    )

    result = df
    for step in steps:
        result = result.transform(step)
    return result

In [0]:
from datacontract.data_contract import DataContract

def validate_df_with_contract(df, contract_path, temp_view_name):
    """Run the data contract tests against ``df`` and return the test run.

    Args:
        df: DataFrame to validate; it is registered as a temp view.
        contract_path: path of the data contract YAML file.
        temp_view_name: name of the temp view; it must be the same as the
            model name declared in the YAML.

    Returns:
        The object returned by ``DataContract.test()``; callers in this
        notebook use its ``pretty()`` and ``has_passed()`` methods.
    """
    df.createOrReplaceTempView(temp_view_name)  # must have the same name as the model in the YAML
    dc = DataContract(data_contract_file=contract_path, spark=spark)
    # Run the checks and hand the result back; without this return the caller
    # receives None and fails on run.pretty().
    return dc.test()




In [0]:
# NOTE(review): a bare name only displays the function object; it does not call the validation.
validate_df_with_contract

In [0]:
from datacontract.data_contract import DataContract

# Stand-alone contract check: build the contract from the YAML file, passing the `spark` object.
dc = DataContract(
    data_contract_file="/Workspace/Users/lgcpazdb892@outlook.com/EngineData/athlete-data-platform/notebooks/dataContract.yaml",
    spark=spark
)
run = dc.test()                 # run the tests

print(run.pretty())             # readable summary (pass/fail per check)
print("PASSOU?", run.has_passed())
# only at this point decide whether to fail the job:
if not run.has_passed():
    # optional: log first, then fail
    raise Exception("Data contract reprovado")


In [0]:
def upsert_data(microBatchDF, batch):
    """foreachBatch handler: dedupe, enrich, validate and merge one micro-batch.

    Steps, in order:
      1. keep one row per business key (``_dedupe_microbatch``);
      2. add the silver columns (``apply_all_silver_calcs``);
      3. validate a projection of the result against the data contract;
      4. insert rows not yet present into the staging table via MERGE.

    Args:
        microBatchDF: the micro-batch DataFrame passed in by ``foreachBatch``.
        batch: the micro-batch identifier passed in by ``foreachBatch`` (unused).

    Raises:
        Exception: when the data contract run does not pass; nothing is
            written for that micro-batch.
    """
    microBatchDF = _dedupe_microbatch(microBatchDF)
    microBatchDF = apply_all_silver_calcs(microBatchDF)
    # TODO(review): the add_silver_ingestion step is currently disabled.
    #microBatchDF = add_silver_ingestion(microBatchDF)

    # Validate the micro-batch before writing anything.
    run = validate_df_with_contract(
        microBatchDF.select("id","start_date","distance_km","pace_min_km","dia_semana"),
        "/Workspace/Users/lgcpazdb892@outlook.com/EngineData/athlete-data-platform/notebooks/dataContract.yaml",
        "dataframe"
    )
    print(run.pretty())             # readable summary (pass/fail per check)
    print("PASSOU?", run.has_passed())

    # Only at this point decide whether to fail the job.
    if not run.has_passed():
        # optional: log first, then fail
        raise Exception("Data contract reprovado")

    microBatchDF.createOrReplaceTempView("activities_microbatch")

    sql_query = """
                MERGE INTO uc_athlete_data.silver.stage_strava_activities A
                USING activities_microbatch B
                ON A.ID = b.ID AND A.INGESTION_TIMESTAMP = B.INGESTION_TIMESTAMP
                WHEN NOT MATCHED THEN INSERT * 
                """

    # Execute the MERGE. Previously the statement was built but never run, so
    # every micro-batch was checkpointed as processed without being written.
    # The micro-batch's own session is used because the temp view above was
    # registered through microBatchDF.
    microBatchDF.sparkSession.sql(sql_query)

In [0]:

# Start the stream: each micro-batch is handed to upsert_data, progress is
# tracked under CHECKPOINT, and the availableNow trigger is used.
query = (
    bronze_stream.writeStream
    .trigger(availableNow=True)
    .option("checkpointLocation", CHECKPOINT)
    .foreachBatch(upsert_data)
    .start()
)

# Block the cell until the streaming query terminates.
query.awaitTermination()

In [0]:
%sql
-- Ad hoc check: list every row currently in the staging table.
select * from uc_athlete_data.silver.stage_strava_activities

In [0]:
%sql
-- Ad hoc check: look up a hard-coded list of activity ids in the silver table.
select * from uc_athlete_data.silver.strava_activities WHERE id in (14165154538,60602,60601,5050,50501);

In [0]:
%sql
-- Ad hoc check: look up two hard-coded activity ids in the bronze table, ordered by id.
select * from uc_athlete_data.bronze.strava_activities  WHERE id in (14165154538,60603)
order by id asc;


In [0]:
%sql
-- Show the change history of the stage_silver table.
-- NOTE(review): other cells create and query silver.stage_strava_activities; confirm stage_silver is the intended table.
describe history uc_athlete_data.silver.stage_silver

In [0]:
from delta.tables import DeltaTable

def upsert_data(microBatchDF, batch_id):
    """Insert the rows of a micro-batch that are not yet in ``stage_silver``.

    The target Delta table is merged with the micro-batch on ID and
    INGESTION_TIMESTAMP; only unmatched source rows are inserted, matched
    rows are left untouched.

    NOTE(review): this redefines ``upsert_data`` and shadows the earlier
    definition in this notebook, which also dedupes, enriches and validates
    the batch and targets ``stage_strava_activities``. Confirm which of the
    two is meant to be used.

    Args:
        microBatchDF: the micro-batch DataFrame passed in by ``foreachBatch``.
        batch_id: the micro-batch identifier passed in by ``foreachBatch`` (unused).
    """
    session = microBatchDF.sparkSession
    target = DeltaTable.forName(session, "uc_athlete_data.silver.stage_silver")

    match_condition = "A.ID = B.ID AND A.INGESTION_TIMESTAMP = B.INGESTION_TIMESTAMP"
    merge_builder = target.alias("A").merge(microBatchDF.alias("B"), match_condition)

    merge_builder.whenNotMatchedInsertAll().execute()

In [0]:
pip install datacontract-cli[databricks]

In [0]:
%restart_python