In [0]:
from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable
from py_functions_silver import *
from pyspark.sql.functions import current_timestamp

# Fully-qualified table names and the streaming checkpoint location used by this notebook.
BRONZE = "uc_athlete_data.bronze.strava_activities"
SILVER = "uc_athlete_data.silver.strava_activities"
CHECKPOINT = "abfss://silver@adlsathlete.dfs.core.windows.net/strava/activities/activities_checkpoint/"

# Business key(s) — switch to a composite key if needed, e.g. ["activity_id","lap_index"]
BUSINESS_KEYS = ["id"]
# Ordering columns used to pick the "winner" among duplicates within a micro-batch
ORDER_COLS = ["ingestion_timestamp"]  # add "updated_at" if it exists

In [0]:
%sql
-- Staging table with the same structure as the silver table and zero rows.
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE fails once
-- the table already exists, and existing staged rows are left untouched.
CREATE TABLE IF NOT EXISTS uc_athlete_data.silver.stage_strava_activities
AS
SELECT *
FROM uc_athlete_data.silver.strava_activities
WHERE 1 = 0;


In [0]:
# Streaming read of the bronze table.
bronze_stream = spark.readStream.table(BRONZE)

In [0]:
def _dedupe_microbatch(df):
    """Keep a single row per business key within a micro-batch.

    Rows are grouped by the columns in ``BUSINESS_KEYS`` and numbered by the
    columns in ``ORDER_COLS`` in descending order; only the first row of each
    group is kept. The temporary ``row_id`` column is dropped before returning.
    """
    partition_exprs = [F.col(key) for key in BUSINESS_KEYS]
    ordering_exprs = [F.col(name).desc() for name in ORDER_COLS]
    ranking_window = Window.partitionBy(*partition_exprs).orderBy(*ordering_exprs)

    ranked = df.withColumn("row_id", F.row_number().over(ranking_window))
    winners = ranked.filter(F.col("row_id") == 1)
    return winners.drop("row_id")

In [0]:
# --- Compose (aplica tudo que você mostrou) -----------------------------------

def apply_all_silver_calcs(df: DataFrame,
                           *,
                           type_col: str = "type",
                           distance_col: str = "distance",
                           average_speed_col: str = "average_speed",
                           moving_time_col: str = "moving_time",
                           elapsed_time_col: str = "elapsed_time",
                           start_date_col: str = "start_date",
                           non_run_value_for_pace=0  # kept identical to the original snippet
                           ) -> DataFrame:
    """
    Apply every silver-layer transformation, in a fixed order, and return the result.

    The ``add_*`` helpers are not defined in this notebook (presumably they come
    from ``py_functions_silver`` — verify). They are asked to write these output
    columns:
      - start_date
      - distance_km
      - average_speed_kmh
      - pace_min_km
      - pace_min_km_moving_time
      - tempo_real
      - pace_min_km_new
      - pace_strava
      - dia_semana

    Order matters: ``pace_strava`` reads ``pace_min_km_new`` and ``dia_semana``
    reads ``start_date``, both produced by earlier steps.

    All column-name arguments are keyword-only. ``non_run_value_for_pace`` is
    forwarded as ``non_run_value`` to the two pace helpers.
    """
    steps = [
        lambda d: add_start_date(d, src_col=start_date_col, out_col="start_date"),
        lambda d: add_distance_km(d, distance_col=distance_col, out_col="distance_km", decimals=2),
        lambda d: add_average_speed_kmh(d, avg_speed_col=average_speed_col,
                                        out_col="average_speed_kmh", decimals=2),
        lambda d: add_pace_min_km(d,
                                  elapsed_time_col=elapsed_time_col,
                                  distance_col=distance_col,
                                  type_col=type_col,
                                  out_col="pace_min_km",
                                  decimals=2,
                                  only_for_run=True,
                                  non_run_value=non_run_value_for_pace),
        lambda d: add_pace_min_km_moving_time(d,
                                              moving_time_col=moving_time_col,
                                              distance_col=distance_col,
                                              type_col=type_col,
                                              out_col="pace_min_km_moving_time",
                                              decimals=2,
                                              only_for_run=True,
                                              non_run_value=non_run_value_for_pace),
        lambda d: add_tempo_real(d, seconds_col=moving_time_col, out_col="tempo_real"),
        lambda d: add_pace_min_km_new(d,
                                      moving_time_col=moving_time_col,
                                      distance_col=distance_col,
                                      out_col="pace_min_km_new",
                                      decimals=3),
        lambda d: add_pace_strava(d, pace_min_col="pace_min_km_new", out_col="pace_strava"),
        lambda d: add_dia_semana(d, date_col="start_date", out_col="dia_semana", pattern="E"),
    ]

    result = df
    for step in steps:
        # DataFrame.transform keeps the same call path as the chained form.
        result = result.transform(step)
    return result

In [0]:
from datacontract.data_contract import DataContract

def validate_df_with_contract(df, contract_path, temp_view_name):
    """Run a data contract against a DataFrame exposed as a temporary view.

    Args:
        df: DataFrame to validate. It is registered exactly as received, so the
            caller decides which columns the contract sees.
        contract_path: Path of the data contract YAML file.
        temp_view_name: Currently unused. The view name is fixed because it has
            to match the model name in the YAML; the parameter is kept so
            existing callers keep working.

    Returns:
        The run object returned by ``DataContract.test()``.

    Raises:
        Exception: If the contract run did not pass.
    """
    # Must match the model name declared in the contract YAML.
    view_name = "strava_activities"

    # Register the DataFrame as-is. The previous hard-coded
    # select("id","start_date","distance","pace","week_day") referenced columns
    # that the caller's pre-selected DataFrame does not contain
    # (it passes distance_km / pace_min_km / dia_semana), so it failed.
    df.createOrReplaceTempView(view_name)

    dc = DataContract(data_contract_file=contract_path, spark=spark)
    run = dc.test()

    print(run.pretty())             # readable summary (pass/fail per check)
    print("PASSOU?", run.has_passed())

    if not run.has_passed():
        # optional: print(run.result) or export JUnit in CI
        raise Exception("Data contract failed for micro-batch")

    # The caller assigns the result, so hand the run back instead of None.
    return run


In [0]:
def upsert_data(microBatchDF, batch):
    """foreachBatch handler: dedupe, enrich and contract-check one micro-batch.

    Steps: drop duplicates per business key, apply the silver calculations,
    validate a five-column projection against the data contract, then register
    the enriched batch as the temp view ``activities_microbatch``.

    NOTE(review): the MERGE statement below is only assembled; the line that
    would execute it is commented out, so this handler does not write to the
    staging table — confirm this is intended.

    Args:
        microBatchDF: The micro-batch DataFrame.
        batch: Second positional argument supplied by foreachBatch; not used here.
    """
    deduped = _dedupe_microbatch(microBatchDF)
    microBatchDF = apply_all_silver_calcs(deduped)
    #microBatchDF = add_silver_ingestion(microBatchDF)

    # Validate the micro-batch (pre-write) on the contract columns only.
    contract_projection = microBatchDF.select(
        "id", "start_date", "distance_km", "pace_min_km", "dia_semana"
    )
    contract_file = "/Workspace/Users/lgcpazdb892@outlook.com/EngineData/athlete-data-platform/notebooks/dataContract.yaml"
    run = validate_df_with_contract(contract_projection, contract_file, "dataframe")

    # Expose the enriched batch under the name used by the MERGE statement.
    microBatchDF.createOrReplaceTempView("activities_microbatch")

    sql_query = """
                MERGE INTO uc_athlete_data.silver.stage_strava_activities A
                USING activities_microbatch B
                ON A.ID = b.ID AND A.INGESTION_TIMESTAMP = B.INGESTION_TIMESTAMP
                WHEN NOT MATCHED THEN INSERT * 
                """

    #microBatchDF.sparkSession.sql(sql_query)

In [0]:

# Start the streaming write: each micro-batch read from bronze_stream is passed
# to upsert_data, with CHECKPOINT as the checkpoint location.
query = (bronze_stream.writeStream
                 .foreachBatch(upsert_data)
                 .option("checkpointLocation", CHECKPOINT)
                 .trigger(availableNow=True)
                 .start())
                 
# Block this cell until the streaming query terminates.
query.awaitTermination()

In [0]:
%sql
-- Inspect the contents of the staging table.
select * from uc_athlete_data.silver.stage_strava_activities

In [0]:
%sql
-- Ad hoc spot-check of specific activity ids in the silver table.
select * from uc_athlete_data.silver.strava_activities WHERE id in (14165154538,60602,60601,5050,50501);

In [0]:
%sql
-- Ad hoc spot-check of specific activity ids in the bronze table, ordered by id.
select * from uc_athlete_data.bronze.strava_activities  WHERE id in (14165154538,60603)
order by id asc;


In [0]:
%sql
-- Show the Delta history of the stage_silver table.
-- NOTE(review): no cell visible in this notebook creates `stage_silver`; the staging
-- table created earlier is `stage_strava_activities` — confirm the table name.
describe history uc_athlete_data.silver.stage_silver

In [0]:
from delta.tables import DeltaTable

def upsert_data(microBatchDF, batch_id):
    """Insert-only merge of a micro-batch into ``uc_athlete_data.silver.stage_silver``.

    Source rows with no target row matching on ID and INGESTION_TIMESTAMP are
    inserted with all their columns; no clause is defined for matched rows.

    NOTE(review): this redefines ``upsert_data`` from an earlier cell, and does
    not dedupe, enrich or validate the batch like that version — confirm which
    definition the stream should use.

    Args:
        microBatchDF: The micro-batch DataFrame (merge source).
        batch_id: Second positional argument supplied by foreachBatch; not used here.
    """
    session = microBatchDF.sparkSession
    target = DeltaTable.forName(session, "uc_athlete_data.silver.stage_silver")
    match_condition = "A.ID = B.ID AND A.INGESTION_TIMESTAMP = B.INGESTION_TIMESTAMP"

    merge_builder = target.alias("A").merge(microBatchDF.alias("B"), match_condition)
    merge_builder.whenNotMatchedInsertAll().execute()

In [0]:
pip install datacontract-cli[databricks]

In [0]:
%restart_python

In [0]:
# Streaming read of the SILVER table, exposed as the temp view "strava_activities".
# NOTE(review): despite its name, `bronze_stream` is re-bound here to the silver table.
# NOTE(review): the preceding cell restarts the Python process, so `SILVER` (set in the
# first cell) would presumably need to be defined again before this runs — confirm.
bronze_stream = spark.readStream.table(SILVER)
bronze_stream.createOrReplaceTempView("strava_activities")

def validate_with_contract_df(df, contract_path, model_view="strava_activities"):
    """Validate a DataFrame against a data contract using the "production" server.

    Args:
        df: DataFrame to validate; registered as a temp view named ``model_view``.
        contract_path: Path of the data contract YAML file.
        model_view: Temp view name; must match the model name in the YAML.

    Raises:
        Exception: If the contract run did not pass.
    """
    df.createOrReplaceTempView(model_view)  # name must match the model in the YAML
    # The server is chosen on the DataContract constructor; test() takes no
    # `server` argument, so passing it there raised a TypeError.
    dc = DataContract(data_contract_file=contract_path, server="production", spark=spark)
    run = dc.test()
    print(run.pretty())
    if not run.has_passed():
        raise Exception("Data contract (DF) reprovado")

In [0]:
# NOTE(review): `df` is only assigned in a later cell of this notebook; on a fresh
# top-to-bottom run this call would presumably fail — confirm the cell order.
display(df)

O código abaixo roda um data contract na tabela silver (Delta) e está funcionando. A lógica está pronta. Próximo passo: encaixar no processo.

In [0]:
from datacontract.data_contract import DataContract

# Table used for the contract test:
SILVER = "uc_athlete_data.silver.strava_activities"  # adjust if a different table is needed

# Read in batch mode (not streaming)
df = spark.read.table(SILVER)

def validate_with_contract_df(df, contract_path, model_view="strava_activities"):
    """Validate a DataFrame against a data contract via a temporary view.

    Args:
        df: DataFrame to validate; registered as a temp view named ``model_view``.
        contract_path: Path of the data contract YAML file.
        model_view: Temp view name; must match the model name in the YAML.

    Raises:
        Exception: If the contract run did not pass.
    """
    # Tip: select only the columns defined in the YAML to avoid a schema mismatch
    # cols = ["id","athlete_id","start_date","start_date_local","timezone","sport_type",
    #         "elapsed_time","moving_time","distance_km","average_speed_kmh","total_elevation_gain",
    #         "has_heartrate","average_heartrate","max_heartrate","average_cadence","week_day",
    #         "ingestion_timestamp"]
    # df = df.select(*[c for c in cols if c in df.columns])

    # The view name has to match the model name in the contract YAML.
    df.createOrReplaceTempView(model_view)

    contract = DataContract(data_contract_file=contract_path, spark=spark)
    result = contract.test()
    print(result.pretty())
    if result.has_passed():
        return
    raise Exception("Data contract (DF) reprovado")

# Usage: validate the batch DataFrame against the contract file.
validate_with_contract_df(
    df,
    "/Workspace/Users/lgcpazdb892@outlook.com/EngineData/athlete-data-platform/notebooks/dataContract.yaml",
    model_view="strava_activities"
)


In [0]:
# --- Configs -----------------------------
CONFIG = {
    "ENABLE_PRE_WRITE_CONTRACT":  True,   # validate the DataFrame (temp view) before the write
    "ENABLE_POST_WRITE_CONTRACT": True,   # validate the table after the write
    "FAIL_ON_REJECTS":            False,  # if True: the batch fails when there are rejects
    "ENFORCEMENT_MODE":           "fail", # "fail" (raise) or "warn" (log only)
    "MODEL_VIEW_NAME":            "strava_activities",  # must match models.<name> in the YAML
    "CONTRACT_DF_PATH":           "/Workspace/Repos/.../contracts/strava_activities_dataframe.yaml",
    "CONTRACT_TABLE_PATH":        "/Workspace/Repos/.../contracts/strava_activities_table.yaml",
    "SILVER_TABLE":               "uc_athlete_data.silver.strava_activities",
    "REJECTS_TABLE":              "uc_athlete_data.silver_rejects.strava_activities",
}


In [0]:
from pyspark.sql import functions as F, types as T
from datacontract.data_contract import DataContract

# ---------------- Enforcement ----------------
def enforce_or_warn(condition: bool, message: str, cfg=CONFIG):
    """Enforce a check according to ``cfg["ENFORCEMENT_MODE"]``.

    Does nothing when ``condition`` is true. Otherwise raises ``Exception(message)``
    if the mode (case-insensitive) is "fail", or prints a ``[WARN]`` line for any
    other mode.
    """
    if not condition:
        strict = cfg["ENFORCEMENT_MODE"].lower() == "fail"
        if strict:
            raise Exception(message)
        print(f"[WARN] {message}")

# ---------------- Contracts -------------------
# Columns exposed to the DataFrame contract check; columns missing from the
# DataFrame are skipped by validate_with_contract_df.
CONTRACT_COLS = ["id","athlete_id","start_date","sport_type",
                 "elapsed_time","moving_time","distance_km",
                 "average_speed_kmh","dia_semana"]

def validate_with_contract_df(df, cfg=CONFIG):
    """Pre-write check: validate a DataFrame against the DataFrame contract.

    Skipped (with an info message) when ``cfg["ENABLE_PRE_WRITE_CONTRACT"]`` is
    false. Otherwise the columns of ``CONTRACT_COLS`` present in ``df`` are
    registered as the temp view ``cfg["MODEL_VIEW_NAME"]`` and the contract at
    ``cfg["CONTRACT_DF_PATH"]`` is run for the "production" server. A failed run
    is handled by ``enforce_or_warn`` (raise or warn, depending on the config).
    """
    if not cfg["ENABLE_PRE_WRITE_CONTRACT"]:
        print("[INFO] Pré-write contract desabilitado.")
        return
    view = cfg["MODEL_VIEW_NAME"]
    # Select only the expected columns (avoids a schema mismatch).
    cols = [c for c in CONTRACT_COLS if c in df.columns]
    df.select(*cols).createOrReplaceTempView(view)

    # The server is chosen on the DataContract constructor; test() takes no
    # `server` argument, so passing it there raised a TypeError.
    dc = DataContract(data_contract_file=cfg["CONTRACT_DF_PATH"], server="production", spark=spark)
    run = dc.test()
    print(run.pretty())
    enforce_or_warn(run.has_passed(), "Data contract (DF) reprovado", cfg)

def validate_with_contract_table(cfg=CONFIG):
    """Post-write check: run the table contract for the "production" server.

    Skipped (with an info message) when ``cfg["ENABLE_POST_WRITE_CONTRACT"]`` is
    false. Otherwise the contract at ``cfg["CONTRACT_TABLE_PATH"]`` is run and a
    failed run is handled by ``enforce_or_warn`` (raise or warn, depending on
    the config).
    """
    if not cfg["ENABLE_POST_WRITE_CONTRACT"]:
        print("[INFO] Pós-write contract desabilitado.")
        return
    # The server is chosen on the DataContract constructor; test() takes no
    # `server` argument, so passing it there raised a TypeError.
    dc = DataContract(data_contract_file=cfg["CONTRACT_TABLE_PATH"], server="production", spark=spark)
    run = dc.test()
    print(run.pretty())
    enforce_or_warn(run.has_passed(), "Data contract (Tabela) reprovado", cfg)

# ---------------- Split good / rejects --------
# Columns that must be non-null for a row to be accepted.
REQUIRED = ["id","athlete_id","start_date","sport_type"]
# Numeric columns that, when present, must be >= 0.
NUM_MIN0 = ["elapsed_time","moving_time","distance_km","average_speed_kmh"]

def cast_and_validate(df, cfg=CONFIG):
    """Cast the validated columns to fixed types and split rows into good / rejected.

    A row is good when all of these hold:
      - every column in ``REQUIRED`` is non-null;
      - every column in ``NUM_MIN0`` is null or >= 0;
      - ``moving_time`` <= ``elapsed_time`` (when both are present);
      - ``dia_semana`` is non-null.

    Args:
        df: Input DataFrame containing the columns cast below plus ``sport_type``.
        cfg: Configuration dict; not used by this function.

    Returns:
        ``(good, bad)`` — ``good`` holds the accepted rows; ``bad`` holds the rest
        with an extra ``reject_reason`` column carrying the first matching reason.
    """
    # Strong typing
    df = (df
          .withColumn("id", F.col("id").cast(T.LongType()))
          .withColumn("athlete_id", F.col("athlete_id").cast(T.LongType()))
          .withColumn("start_date", F.to_timestamp("start_date"))
          .withColumn("elapsed_time", F.col("elapsed_time").cast(T.LongType()))
          .withColumn("moving_time", F.col("moving_time").cast(T.LongType()))
          .withColumn("distance_km", F.col("distance_km").cast(T.DoubleType()))
          .withColumn("average_speed_kmh", F.col("average_speed_kmh").cast(T.DoubleType()))
          .withColumn("dia_semana", F.col("dia_semana").cast(T.StringType()))
         )

    # Base rules
    cond = F.lit(True)
    for c in REQUIRED:
        cond = cond & F.col(c).isNotNull()
    for c in NUM_MIN0:
        cond = cond & (F.col(c).isNull() | (F.col(c) >= 0))
    cond = cond & (F.col("moving_time").isNull()
                   | F.col("elapsed_time").isNull()
                   | (F.col("moving_time") <= F.col("elapsed_time")))
    # dia_semana must be present. The previous isNull() was inverted: it kept only
    # rows WITHOUT dia_semana as good, contradicting the "dia_semana_null" reject reason.
    cond = cond & F.col("dia_semana").isNotNull()

    good = df.where(cond)
    bad  = (df.where(~cond)
              .withColumn("reject_reason",
                  F.when(F.col("id").isNull(), "id_null")
                   .when(F.col("athlete_id").isNull(), "athlete_id_null")
                   .when(F.col("start_date").isNull(), "start_date_null")
                   .when(F.col("sport_type").isNull(), "sport_type_null")
                   .when(F.col("elapsed_time") < 0, "elapsed_time_neg")
                   .when(F.col("moving_time") < 0, "moving_time_neg")
                   .when(F.col("moving_time") > F.col("elapsed_time"), "moving>elapsed")
                   .when((F.col("dia_semana").isNull()), "dia_semana_null")
                   # The remaining NUM_MIN0 rules had no reason of their own and fell
                   # into "invalid_generic"; placed last so existing reasons keep priority.
                   .when(F.col("distance_km") < 0, "distance_km_neg")
                   .when(F.col("average_speed_kmh") < 0, "average_speed_kmh_neg")
                   .otherwise("invalid_generic")
              ))

    return good, bad

def handle_rejects(bad_df, cfg=CONFIG):
    """Persist rejected rows and optionally enforce a failure.

    When ``bad_df`` is ``None`` or empty, only an info message is printed.
    Otherwise the rows are appended to ``cfg["REJECTS_TABLE"]`` with
    ``ingestion_timestamp`` set to the current timestamp, and
    ``enforce_or_warn`` is called with ``not cfg["FAIL_ON_REJECTS"]`` as the
    condition (so it raises or warns only when FAIL_ON_REJECTS is true).
    """
    # DataFrame.isEmpty() answers the same question without converting to an RDD
    # (the RDD API is not available on Spark Connect / shared-access compute).
    if bad_df is None or bad_df.isEmpty():
        print("[INFO] Nenhum rejeito no micro-lote.")
        return
    (bad_df.withColumn("ingestion_timestamp", F.current_timestamp())
           .write.mode("append").format("delta")
           .saveAsTable(cfg["REJECTS_TABLE"]))
    print("[WARN] Rejeitos gravados em", cfg["REJECTS_TABLE"])
    enforce_or_warn(not cfg["FAIL_ON_REJECTS"], "Rejeitos detectados; lote abortado", cfg)


In [0]:
# End-to-end dry run of the validation flow on a batch read of the silver table.
df = spark.read.table("uc_athlete_data.silver.strava_activities")  # or another source

good, bad = cast_and_validate(df, CONFIG)
handle_rejects(bad, CONFIG)
validate_with_contract_df(good, CONFIG)       # only runs the check if ENABLE_PRE_WRITE_CONTRACT=True
# ... simulate a write here, if desired ...
validate_with_contract_table(CONFIG)          # only runs the check if ENABLE_POST_WRITE_CONTRACT=True


In [0]:
# Show the rows accepted by cast_and_validate.
display(good)

In [0]:
%sql
-- 1) Create the rejects schema (if it does not exist yet)
CREATE SCHEMA IF NOT EXISTS uc_athlete_data.silver_rejects;

-- 2) Create the rejects table
CREATE TABLE IF NOT EXISTS uc_athlete_data.silver_rejects.strava_activities (
  id                   BIGINT,
  athlete_id           BIGINT,
  start_date           TIMESTAMP,
  start_date_local     TIMESTAMP,
  timezone             STRING,
  sport_type           STRING,
  elapsed_time         BIGINT,
  moving_time          BIGINT,
  distance_km          DOUBLE,
  average_speed_kmh    DOUBLE,
  total_elevation_gain DOUBLE,
  has_heartrate        BOOLEAN,
  average_heartrate    INT,
  max_heartrate        INT,
  average_cadence      DOUBLE,
  dia_semana           STRING,
  -- rejection audit fields:
  reject_reason        STRING,        -- reason for the rejection (e.g. required_null, moving>elapsed, etc.)
  ingestion_timestamp  TIMESTAMP      -- timestamp of when the row was saved to the rejects table
)
USING DELTA;
