## **FRESHNESS**

## **DATA FRESHNESS**

In [0]:
%sql
-- Data freshness per (layer, RETAILER, CATEGORY):
--   1. latest_per_layer : newest DATE in each medallion table per retailer/category
--   2. lagged           : seconds elapsed between that date and the query's current_timestamp()
--   3. final SELECT     : lag expressed in hours/days/weeks plus a health label
WITH latest_per_layer AS (
  SELECT 'BRONZE' AS layer, RETAILER, CATEGORY, MAX(DATE) AS latest_date
  FROM `workspace`.`default`.`bronze_fnb_sales`
  WHERE RETAILER IS NOT NULL AND CATEGORY IS NOT NULL
  GROUP BY RETAILER, CATEGORY

  UNION ALL

  SELECT 'SILVER' AS layer, RETAILER, CATEGORY, MAX(DATE) AS latest_date
  FROM `workspace`.`default`.`silver_fnb_sales`
  WHERE RETAILER IS NOT NULL AND CATEGORY IS NOT NULL
  GROUP BY RETAILER, CATEGORY

  UNION ALL

  SELECT 'GOLD' AS layer, RETAILER, CATEGORY, MAX(DATE) AS latest_date
  FROM `workspace`.`default`.`gold_fnb_sales`
  WHERE RETAILER IS NOT NULL AND CATEGORY IS NOT NULL
  GROUP BY RETAILER, CATEGORY
),

lagged AS (
  SELECT
    layer,
    RETAILER,
    CATEGORY,
    latest_date,
    -- latest_date is cast to a timestamp before differencing against "now"
    unix_timestamp(current_timestamp()) - unix_timestamp(cast(latest_date AS timestamp)) AS lag_seconds
  FROM latest_per_layer
)

SELECT
  layer,
  RETAILER,
  CATEGORY,
  latest_date,
  round(lag_seconds / 3600.0, 4)        AS freshness_lag_hours,
  round(lag_seconds / 86400.0, 4)       AS freshness_lag_days,
  round(lag_seconds / (86400.0*7), 4)   AS freshness_lag_weeks,
  -- thresholds are in days: more than 40 => STALE, more than 26 => LATE
  CASE
    WHEN lag_seconds / 86400.0 > 40 THEN 'STALE'
    WHEN lag_seconds / 86400.0 > 26 THEN 'LATE'
    ELSE 'OK'
  END AS health
FROM lagged
ORDER BY freshness_lag_days DESC, layer, RETAILER, CATEGORY;


In [0]:
# Save _sqldf as a Delta table in workspace.default
# `_sqldf` is not defined in this cell; it is presumably the implicit result
# DataFrame of the preceding %sql cell (Databricks) -- run that cell first.
from pyspark.sql import functions as F
from datetime import datetime, timezone
import uuid, os

# The Delta writes in this cell were disabled (commented out) while it still
# printed a success message. Persistence is now an explicit switch: leave it
# False for a dry run, set it to True to actually write.
WRITE_ENABLED = False

df = _sqldf  # rename for clarity
# Tag every row of this run with a shared id and the run timestamp.
df = df.withColumn("metric_run_id", F.lit(str(uuid.uuid4())))
df = df.withColumn("run_ts", F.current_timestamp())

CAT, SCH, TBL = "workspace", "default", "freshness_datafreshness_metrics"
FULL = f"{CAT}.{SCH}.{TBL}"

if not WRITE_ENABLED:
    # Report honestly that nothing was persisted.
    print(f"ℹ️ Dry run: nothing written to {FULL} (WRITE_ENABLED is False)")
else:
    try:
        df.write.format("delta").mode("overwrite").saveAsTable(FULL)
        print(f"✅ Created/overwritten table: {FULL}")
    except Exception as e:
        # An exception with an empty message would make splitlines() return [],
        # so fall back to repr(e) instead of indexing an empty list.
        first_line = (str(e).splitlines() or [repr(e)])[0]
        print(f"⚠️ Write to {FULL} failed: {first_line}")
        # fallback to the user's own folder (expected to be writable)
        try:
            user = spark.sql("SELECT current_user() as u").collect()[0]["u"]
        except Exception:
            user = os.environ.get("USER") or "unknown_user"
        safe_user = user.replace("@","_at_").replace(" ","_")
        # Timezone-aware replacement for the deprecated datetime.utcnow(); same UTC stamp.
        path = f"/Users/{safe_user}/do_tool/{TBL}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
        df.write.format("delta").mode("overwrite").save(path)
        print(f"✅ Saved Delta files to: {path}\n\nAsk admin to register it with:\nCREATE TABLE {FULL} USING DELTA LOCATION '{path}';")


removal for volume


In [0]:
%sql
-- Spot check: silver rows for one retailer / category / segment combination
SELECT *
FROM workspace.default.silver_fnb_sales
WHERE RETAILER = 'WALMART'
  AND CATEGORY = 'Beverages'
  AND SEGMENT = 'Fruit Juices';

## **TABLE FRESHNESS**

In [0]:
# Fixed: timezone-normalized minimal version of your freshness cell
import re
from datetime import datetime, timezone, timedelta
from pyspark.sql import functions as F

# Schema holding the medallion tables; no catalog is given here, so the
# names resolve against the session's current catalog.
DEFAULT_DB = "default"

# layer name -> table name, in bronze -> silver -> gold order
tables = {
    layer: f"{DEFAULT_DB}.{layer}_fnb_sales"
    for layer in ("bronze", "silver", "gold")
}

def to_py_dt(spark_ts):
    """Best-effort conversion of a value to a Python ``datetime``.

    Resolution order:
      1. ``None`` is returned as ``None``.
      2. A ``datetime`` instance is returned unchanged.
      3. Otherwise the object's ``toPyDateTime()`` method is tried.
      4. Otherwise ``datetime.fromisoformat(str(value))`` is tried.

    Returns ``None`` when every attempt fails. No timezone handling is done
    here: the result may be naive or aware, exactly as produced.
    """
    if spark_ts is None:
        return None
    if isinstance(spark_ts, datetime):
        return spark_ts

    try:
        return spark_ts.toPyDateTime()
    except Exception:
        # No usable toPyDateTime(); fall through to ISO-string parsing.
        pass

    try:
        return datetime.fromisoformat(str(spark_ts))
    except Exception:
        return None

# Ask Spark for the current timestamp and express it as a tz-aware UTC datetime.
now = spark.sql("SELECT current_timestamp() as ts").collect()[0]["ts"]
now_py = to_py_dt(now)
if now_py is None:
    # Conversion failed: fall back to the driver's own clock.
    now_py = datetime.now(timezone.utc)
else:
    # A naive value is labelled as UTC (no shift); an aware one is converted.
    # NOTE(review): labelling assumes the naive value already represents UTC -- confirm.
    now_py = (
        now_py.replace(tzinfo=timezone.utc)
        if now_py.tzinfo is None
        else now_py.astimezone(timezone.utc)
    )

def get_last_updated(full_table_name):
    """Estimate when ``full_table_name`` was last updated.

    Three strategies are tried in order; the first one that yields a
    timestamp wins:

      1. ``DESCRIBE HISTORY ... LIMIT 1``  -> method ``"delta_describe_history"``
      2. the greatest value across timestamp/date-typed columns (or, if there
         are none, columns whose name looks time-related)
                                           -> method ``"max_timestamp_column"``
      3. the newest file modification time under the table location reported
         by ``DESCRIBE DETAIL``            -> method ``"file_mod_time"``

    Returns a dict with keys ``table``, ``last_updated`` (tz-aware UTC
    datetime or ``None``), ``method``, ``detail`` and ``notes``. ``notes`` is
    only populated when every strategy failed; it then holds the collected
    failure reasons joined with ``";"`` (or ``None`` if there were none).
    """

    def _utc(value):
        # Naive datetimes are labelled UTC as-is; aware ones are converted to UTC.
        if value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value.astimezone(timezone.utc)

    def _result(last_updated, method, detail, notes_text=None):
        return {"table": full_table_name, "last_updated": last_updated, "method": method, "detail": detail, "notes": notes_text}

    def _newest_file_mtime(path):
        # Recursively walk `path`, returning the newest file modification time
        # (tz-aware UTC) or None when nothing could be listed or read.
        try:
            entries = dbutils.fs.ls(path)
        except Exception:
            return None
        newest = None
        for entry in entries:
            try:
                if entry.isDir():
                    candidate = _newest_file_mtime(entry.path)
                else:
                    millis = getattr(entry, "modificationTime", None)
                    # modificationTime is divided by 1000 before conversion (treated as milliseconds)
                    candidate = datetime.fromtimestamp(millis/1000.0, tz=timezone.utc) if millis else None
                if candidate and (newest is None or candidate > newest):
                    newest = candidate
            except Exception:
                continue
        return newest

    problems = []

    # 1) DESCRIBE HISTORY
    try:
        history_rows = spark.sql(f"DESCRIBE HISTORY {full_table_name} LIMIT 1").collect()
        if history_rows:
            commit_ts = to_py_dt(history_rows[0]["timestamp"])
            if commit_ts is not None:
                return _result(_utc(commit_ts), "delta_describe_history", None)
    except Exception as e:
        problems.append(f"describe_history_failed:{e}")

    # 2) max timestamp-like column
    try:
        table_df = spark.table(full_table_name)
        candidates = [name for name, dtype in table_df.dtypes if dtype.startswith(("timestamp", "date"))]
        if not candidates:
            # No typed candidates: fall back to columns with a time-ish name.
            candidates = [name for name in table_df.columns if re.search(r"(updated|modified|ingest|inserted|event|ts|time|dt)", name, re.I)]
        newest, newest_col = None, None
        for name in candidates:
            try:
                col_max = to_py_dt(table_df.agg(F.max(F.col(name)).alias("mx")).collect()[0]["mx"])
                if col_max is None:
                    continue
                col_max = _utc(col_max)
                if newest is None or col_max > newest:
                    newest, newest_col = col_max, name
            except Exception:
                # A column that cannot be aggregated or compared is skipped.
                continue
        if newest is not None:
            return _result(newest, "max_timestamp_column", newest_col)
    except Exception as e:
        problems.append(f"max_timestamp_col_failed:{e}")

    # 3) file modification times (if available)
    try:
        detail_rows = spark.sql(f"DESCRIBE DETAIL {full_table_name}").collect()
        if detail_rows:
            detail = detail_rows[0].asDict()
            loc = detail.get("location") or detail.get("Location")
            if not loc:
                problems.append("no_location_in_describe_detail")
            else:
                latest = _newest_file_mtime(loc)
                if latest:
                    return _result(latest, "file_mod_time", loc)
                problems.append("no_file_mod_time_found")
    except Exception as e:
        problems.append(f"describe_detail_failed:{e}")

    return _result(None, None, None, ";".join(problems) or None)

# ---------------- Replace previous freshness loop with deterministic v1 vs fixed target ----------------
from datetime import datetime, timezone
from zoneinfo import ZoneInfo
from pyspark.sql import types as T

# Fixed comparison instant: Friday, 14 Nov 2025 07:00 IST, plus its UTC equivalent
target_local = datetime(year=2025, month=11, day=14, hour=7, tzinfo=ZoneInfo("Asia/Kolkata"))
target_utc = target_local.astimezone(ZoneInfo("UTC"))

# Health thresholds, expressed in days of lag
LATE_DAYS = 2.0
STALE_DAYS = 4.0

def get_version_timestamp(full_table_name, version=1):
    """Look up the commit timestamp of one table version.

    Runs ``DESCRIBE HISTORY`` on ``full_table_name`` and scans the rows for
    the first one whose ``version`` equals ``version``.

    Returns that row's timestamp as a timezone-aware UTC ``datetime`` (a
    naive value is labelled UTC, an aware one is converted). Returns ``None``
    when the version is absent, the timestamp cannot be converted, or any
    exception occurs along the way.
    """
    try:
        history = spark.sql(f"DESCRIBE HISTORY {full_table_name}").select("version", "timestamp").collect()
        for entry in history:
            fields = entry.asDict()
            if fields.get("version") is None:
                continue
            if int(fields["version"]) != int(version):
                continue
            # First matching row decides the outcome.
            commit_ts = to_py_dt(fields.get("timestamp"))
            if commit_ts is None:
                return None
            if commit_ts.tzinfo is None:
                return commit_ts.replace(tzinfo=timezone.utc)
            return commit_ts.astimezone(timezone.utc)
        return None
    except Exception:
        return None

def classify_health(days_diff):
    """Map a freshness lag in days to a health label.

    ``None`` -> ``"UNKNOWN"``; strictly above ``STALE_DAYS`` -> ``"STALE"``;
    strictly above ``LATE_DAYS`` -> ``"LATE"``; anything else -> ``"OK"``.
    """
    if days_diff is None:
        label = "UNKNOWN"
    elif days_diff > STALE_DAYS:
        label = "STALE"
    elif days_diff > LATE_DAYS:
        label = "LATE"
    else:
        label = "OK"
    return label

# One metrics record per layer: lag between the fixed target instant and the
# table's version-1 commit time.
rows = []
for layer, tbl in tables.items():
    v1_ts = get_version_timestamp(tbl, version=1)

    # Defaults describe a table whose version-1 commit could not be determined.
    freshness_lag_seconds = None
    freshness_lag_hours = None
    freshness_lag_days = None
    freshness_lag_weeks = None
    health = "UNKNOWN"
    last_used_ts = None
    detection_method = None

    if v1_ts is not None:
        # Make sure the baseline is tz-aware UTC before subtracting.
        v1_ts = (
            v1_ts.replace(tzinfo=timezone.utc)
            if v1_ts.tzinfo is None
            else v1_ts.astimezone(timezone.utc)
        )
        # lag = target_utc - v1_ts, truncated to whole seconds
        delta = target_utc - v1_ts
        freshness_lag_seconds = int(delta.total_seconds())
        freshness_lag_hours = freshness_lag_seconds / 3600.0
        freshness_lag_days = freshness_lag_seconds / 86400.0
        freshness_lag_weeks = freshness_lag_seconds / 604800.0
        health = classify_health(freshness_lag_days)
        last_used_ts = v1_ts  # the version-1 commit time is the baseline
        detection_method = "delta_describe_history_v1"

    rows.append(dict(
        layer=layer,
        table=tbl,
        last_updated=last_used_ts,
        detection_method=detection_method,
        detection_detail=f"compared_against_{target_local.isoformat()}_IST",
        freshness_lag_weeks=freshness_lag_weeks,
        freshness_lag_days=freshness_lag_days,
        freshness_lag_hours=freshness_lag_hours,
        health=health,
        notes=None,
    ))

# Result schema; every column is nullable. The order of this list is also the
# order in which each record's values are laid out below.
schema = T.StructType([
    T.StructField(col_name, col_type, True)
    for col_name, col_type in [
        ("layer", T.StringType()),
        ("table", T.StringType()),
        ("last_updated", T.TimestampType()),
        ("freshness_lag_weeks", T.DoubleType()),
        ("freshness_lag_days", T.DoubleType()),
        ("freshness_lag_hours", T.DoubleType()),
        ("health", T.StringType()),
        ("detection_method", T.StringType()),
        ("detection_detail", T.StringType()),
        ("notes", T.StringType()),
    ]
])

# Flatten each record dict into a tuple that follows the schema's column order
# (Python datetime objects are passed through for the timestamp field).
spark_rows = [tuple(r[field.name] for field in schema.fields) for r in rows]

metrics_df = spark.createDataFrame(spark_rows, schema=schema)
# add end_to_end (use v1 gold - v1 bronze if available)
v1_bronze = get_version_timestamp(tables["bronze"], version=1)
v1_gold = get_version_timestamp(tables["gold"], version=1)
if v1_bronze and v1_gold:
    # Whole seconds between the bronze and gold version-1 commits.
    e2e_seconds = int((v1_gold - v1_bronze).total_seconds())
    e2e_days = e2e_seconds / 86400.0
else:
    e2e_seconds = None
    e2e_days = None

from pyspark.sql import functions as F
# Cast explicitly: F.lit(None) alone yields an untyped (void) column, so the
# schema would differ between runs where the lineage is known and runs where
# it is not. "int"/"double" match the types the literals get when non-null.
metrics_df = (
    metrics_df
    .withColumn("end_to_end_seconds", F.lit(e2e_seconds).cast("int"))
    .withColumn("end_to_end_days", F.lit(e2e_days).cast("double"))
)

# Render the table once (it used to be displayed twice in a row).
display(metrics_df)
print("Compared version 1 timestamps against target:", target_local.isoformat(), " (IST)")

print("Summary (Python view):")
for r in rows:
    print(r)
print(f"end_to_end_seconds (gold - bronze): {e2e_seconds}, end_to_end_days: {e2e_days}")


In [0]:

from datetime import datetime
import uuid, os

# The Delta writes in this cell were disabled (commented out) while it still
# printed a success message. Persistence is now an explicit switch: leave it
# False for a dry run, set it to True to actually write.
WRITE_ENABLED = False

# metrics_df is produced by the table-freshness cell; run that cell first.
df = metrics_df
CAT, SCH, TBL = "workspace", "default", "freshness_tablefreshness"
FULL = f"{CAT}.{SCH}.{TBL}"

if not WRITE_ENABLED:
    # Report honestly that nothing was persisted.
    print(f"ℹ️ Dry run: nothing written to {FULL} (WRITE_ENABLED is False)")
else:
    try:
        df.write.format("delta").mode("overwrite").saveAsTable(FULL)
        print(f"✅ Successfully wrote table: {FULL}")
    except Exception as e:
        # An exception with an empty message would make splitlines() return [],
        # so fall back to repr(e) instead of indexing an empty list.
        first_line = (str(e).splitlines() or [repr(e)])[0]
        print(f"⚠️ Write to {FULL} failed: {first_line}")
        # Fallback: save in the user's own folder (expected to be writable)
        try:
            user = spark.sql("SELECT current_user() as u").collect()[0]["u"]
        except Exception:
            user = os.environ.get("USER") or "unknown_user"
        safe_user = user.replace("@", "_at_").replace(" ", "_")
        # A random suffix keeps repeated fallback runs from colliding.
        path = f"/Users/{safe_user}/do_tool/{TBL}_{uuid.uuid4().hex}"
        df.write.format("delta").mode("overwrite").save(path)
        print(f"✅ Saved Delta files to: {path}")
        print(f"\nIf you want it registered as a shared table, ask an admin to run:\n"
              f"CREATE TABLE {FULL} USING DELTA LOCATION '{path}';")
