In [0]:
import hashlib
import json
from datetime import datetime, timezone

from pyspark.sql import functions as F
from pyspark.sql.types import (
    StringType, LongType, TimestampType, StructType, StructField
)

# ============================================================
# CONFIGURATION
# ============================================================

# Root of the volume that holds the raw files and their manifest.
RAW_VOLUME_ROOT = "/Volumes/census/raw/raw_files"
MANIFEST_PATH = f"{RAW_VOLUME_ROOT}/manifest.json"

# Three-level name of the registry table this cell merges into.
CATALOG = "census"
SCHEMA = "bronze"
TABLE = "file_registry_v1"
FULL_TABLE = f"{CATALOG}.{SCHEMA}.{TABLE}"

# CSV files smaller than this are counted exactly; larger ones are estimated.
QUICK_COUNT_MAX_BYTES = 5 * 1024 * 1024

# One id per run, e.g. "reg-20260106T090523Z". datetime.utcnow() is deprecated
# (Python 3.12+); an aware UTC "now" formats to exactly the same string.
INGESTION_BATCH_ID = f"reg-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}"

# Switch this Spark session to the legacy (pre-Spark-3.0) datetime pattern
# parser. NOTE(review): no explicit timestamp-pattern parsing is visible in
# this cell; presumably other cells in the session rely on it — confirm before
# removing.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# ============================================================
# HELPERS
# ============================================================

def parse_manifest_row_count(entry):
    """Return the row count declared by a manifest entry, if there is one.

    The keys ``row_count``, ``rows`` and ``rowCount`` are tried in that order.
    The first value that is present (not ``None``) and convertible with
    ``int()`` is returned.

    Args:
        entry: Mapping describing one manifest part; only ``.get`` is used.

    Returns:
        The declared row count as an ``int``, or ``None`` when no key holds a
        convertible value.
    """
    candidates = (entry.get(alias) for alias in ("row_count", "rows", "rowCount"))
    for candidate in candidates:
        if candidate is not None:
            try:
                return int(candidate)
            except Exception:
                # Not integer-like; fall through to the next alias.
                continue
    return None


def estimate_csv_rows(path, file_size):
    """Estimate the number of lines in a CSV file from a sample of its head.

    Reads up to the first 1,000,000 bytes of ``path`` with
    ``dbutils.fs.head``, counts the newlines in that sample and scales the
    count by ``file_size / sample_length``.

    Args:
        path: Location of the file, as accepted by ``dbutils.fs.head``.
        file_size: Total size of the file in bytes.

    Returns:
        The extrapolated line count as an ``int`` (every newline counts, so a
        header line is included), or ``None`` when the sample is empty, holds
        no newline, or cannot be read.
    """
    try:
        sample = dbutils.fs.head(path, 1_000_000)
        if not sample:
            return None
        if isinstance(sample, str):
            # A strict latin1 encode raises UnicodeEncodeError on any character
            # above U+00FF (for example U+FFFD, the replacement character),
            # and the blanket except below then discarded the whole estimate.
            # errors="replace" substitutes "?" so the sample length stays at
            # one byte per character and the estimate is still produced.
            sample = sample.encode("latin1", errors="replace")
        newline_count = sample.count(b"\n")
        if newline_count == 0:
            return None
        return int((file_size / len(sample)) * newline_count)
    except Exception:
        # Deliberate best effort: an unreadable file yields "no estimate"
        # rather than aborting the caller.
        return None


# ============================================================
# LOAD MANIFEST
# ============================================================

def read_manifest(path):
    """Load the manifest at ``path`` and return it as plain Python data.

    The first 10 MiB of the file are read with ``dbutils.fs.head`` and parsed
    as one JSON document. If that text is not valid JSON, the file is read
    again with ``spark.read.json``.

    Args:
        path: Location of the manifest file.

    Returns:
        The parsed JSON document. On the Spark fallback: a ``dict`` when the
        file yields exactly one record, otherwise ``{"parts": [dict, ...]}``
        with one dict per record.
    """
    text = dbutils.fs.head(path, 10 * 1024 * 1024)
    try:
        return json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; fall through to Spark.
        pass

    # Spark returns Row objects, which have no .get(); the code consuming the
    # manifest calls .get() on both the manifest and each part. Convert to
    # (nested) dicts so both manifest shapes behave the same downstream.
    records = [row.asDict(recursive=True) for row in spark.read.json(path).collect()]
    if len(records) == 1:
        return records[0]
    return {"parts": records}


manifest = read_manifest(MANIFEST_PATH)

# Fail early with a clear message when the manifest is not a JSON object
# (e.g. a top-level array); otherwise the .get() calls on it die with an
# opaque AttributeError.
if not isinstance(manifest, dict):
    raise RuntimeError(
        f"Manifest at {MANIFEST_PATH} must be a JSON object, "
        f"got {type(manifest).__name__}"
    )

# Each entry of "parts" describes one raw file to register.
parts = manifest.get("parts", [])

if not parts:
    raise RuntimeError("Manifest contains no parts")

# ============================================================
# PROCESS FILES
# ============================================================

# Build one registry record (a dict matching the registry schema) per manifest
# entry whose file can be found under RAW_VOLUME_ROOT. Entries without a
# filename, or whose file cannot be listed, are skipped with a warning.
records = []

# Manifest-level metadata is the same for every file; look it up once.
generation_seed = manifest.get("seed")
generation_notes = manifest.get("notes")

for part in parts:
    fname = part.get("filename")
    if not fname:
        print(f"⚠ Skipping manifest entry without a filename: {part!r}")
        continue

    full_path = f"{RAW_VOLUME_ROOT}/{fname}"

    # Size and modification time come from the file listing.
    try:
        info = dbutils.fs.ls(full_path)[0]
        size = info.size
        # modificationTime is divided by 1000 to get epoch seconds; keep the
        # result timezone-aware so it denotes an unambiguous instant.
        mod_time = datetime.fromtimestamp(info.modificationTime / 1000, tz=timezone.utc)
    except Exception as exc:
        print(f"⚠ Skipping {fname}: cannot list {full_path} ({type(exc).__name__})")
        continue

    lower_name = fname.lower()

    # Checksum. sha256_checksum must be the digest of the complete file or
    # NULL, never a digest of a partial, re-encoded text sample.
    sha256 = None
    ftype = "unknown"
    try:
        content = (
            spark.read.format("binaryFile")
            .load(full_path)
            .select("content")
            .first()[0]
        )
        sha256 = hashlib.sha256(content).hexdigest()
        ftype = "parquet" if lower_name.endswith(".parquet") else "csv"
    except Exception:
        # Spark could not hand over the whole file (e.g. it is too large for a
        # single binary column). Stream the bytes through the volume's file
        # path instead so the digest is still that of the full file.
        # file_type stays "unknown" on this path, as before.
        try:
            digest = hashlib.sha256()
            with open(full_path, "rb") as fh:
                for chunk in iter(lambda: fh.read(8 * 1024 * 1024), b""):
                    digest.update(chunk)
            sha256 = digest.hexdigest()
        except Exception as exc:
            print(f"⚠ No checksum for {fname}: {type(exc).__name__}")

    manifest_count = parse_manifest_row_count(part)
    quick_count = None

    # Row count: exact for small CSVs, extrapolated from a sample for large
    # ones, left as NULL for everything else.
    if lower_name.endswith(".csv"):
        if size < QUICK_COUNT_MAX_BYTES:
            try:
                quick_count = (
                    spark.read
                    .option("header", "true")
                    .option("sep", ";")
                    .option("encoding", "latin1")
                    .csv(full_path)
                    .count()
                )
            except Exception as exc:
                print(f"⚠ Quick count failed for {fname}: {type(exc).__name__}")
        else:
            quick_count = estimate_csv_rows(full_path, size)

    # One aware UTC timestamp per record: datetime.utcnow() is deprecated and
    # returns a naive value that is read as local time when converted.
    now_utc = datetime.now(timezone.utc)

    records.append({
        "filename": fname,
        "filepath": full_path,
        "file_size_bytes": size,
        "modification_time": mod_time,
        "sha256_checksum": sha256,
        "manifest_reported_row_count": manifest_count,
        "quick_count": quick_count,
        "file_type": ftype,
        "generation_seed": generation_seed,
        "generation_notes": generation_notes,
        "ingestion_status": "Pending",
        "ingestion_attempts": 0,
        "last_ingestion_timestamp": None,
        "ingestion_batch_id": INGESTION_BATCH_ID,
        "provenance_json": json.dumps({"manifest_entry": part}),
        "created_at": now_utc,
        "updated_at": now_utc,
    })

# ============================================================
# DATAFRAME CREATION
# ============================================================

# Registry row layout in column order: (column name, Spark type class).
registry_columns = [
    ("filename", StringType),
    ("filepath", StringType),
    ("file_size_bytes", LongType),
    ("modification_time", TimestampType),
    ("sha256_checksum", StringType),
    ("manifest_reported_row_count", LongType),
    ("quick_count", LongType),
    ("file_type", StringType),
    ("generation_seed", StringType),
    ("generation_notes", StringType),
    ("ingestion_status", StringType),
    ("ingestion_attempts", LongType),
    ("last_ingestion_timestamp", TimestampType),
    ("ingestion_batch_id", StringType),
    ("provenance_json", StringType),
    ("created_at", TimestampType),
    ("updated_at", TimestampType),
]

# Instantiate one field per column from the table above.
schema = StructType([
    StructField(column_name, type_cls())
    for column_name, type_cls in registry_columns
])

# Turn the collected record dicts into a DataFrame with that explicit schema.
df = spark.createDataFrame(records, schema=schema)

# ============================================================
# MERGE INTO DELTA (UNITY CATALOG SAFE)
# ============================================================

# Make CATALOG the session's current catalog. The MERGE target below is
# already fully qualified through FULL_TABLE.
spark.sql(f"USE CATALOG {CATALOG}")

# Expose this run's rows to SQL under the name the MERGE reads from.
df.createOrReplaceTempView("staging_registry")

# Upsert into the registry table, keyed on filename:
#   * Matched rows take the file metadata, batch id, provenance and updated_at
#     from staging.
#   * ingestion_attempts and last_ingestion_timestamp are assigned to
#     themselves, i.e. kept as stored; created_at is not in the SET list and
#     is therefore kept as well.
#   * ingestion_status IS overwritten with the staged value, which this cell
#     always sets to "Pending".
#     NOTE(review): this resets the status of already-registered files on
#     every run — confirm that is intended.
#   * Unmatched rows are inserted with all staging columns (INSERT *).
spark.sql(f"""
MERGE INTO {FULL_TABLE} AS t
USING staging_registry AS s
ON t.filename = s.filename
WHEN MATCHED THEN UPDATE SET
    t.filepath = s.filepath,
    t.file_size_bytes = s.file_size_bytes,
    t.modification_time = s.modification_time,
    t.sha256_checksum = s.sha256_checksum,
    t.manifest_reported_row_count = s.manifest_reported_row_count,
    t.quick_count = s.quick_count,
    t.file_type = s.file_type,
    t.generation_seed = s.generation_seed,
    t.generation_notes = s.generation_notes,
    t.ingestion_status = s.ingestion_status,
    t.ingestion_attempts = t.ingestion_attempts,
    t.last_ingestion_timestamp = t.last_ingestion_timestamp,
    t.ingestion_batch_id = s.ingestion_batch_id,
    t.provenance_json = s.provenance_json,
    t.updated_at = s.updated_at
WHEN NOT MATCHED THEN
  INSERT *
""")

# ============================================================
# WRITE REGISTRATION REPORT 
# ============================================================

# Per-file summary of this run: checksum plus the two row counts.
file_summaries = [
    {
        "filename": r["filename"],
        "sha256": r["sha256_checksum"],
        "quick_count": r["quick_count"],
        "manifest_row_count": r["manifest_reported_row_count"]
    }
    for r in records
]

report = {
    "ingestion_batch_id": INGESTION_BATCH_ID,
    # datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC "now"
    # and drop the tzinfo so the emitted ISO string keeps its previous format
    # (no "+00:00" suffix); the key name already states the zone.
    "timestamp_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    "registered_files": len(records),
    "files": file_summaries,
}

# The report is stored next to the raw files, one JSON file per batch id.
report_path = f"{RAW_VOLUME_ROOT}/registration_reports/registration_report_{INGESTION_BATCH_ID}.json"
dbutils.fs.put(report_path, json.dumps(report, indent=2), overwrite=True)

print(f"✔ Registration completed: {len(records)} files")
print(f"✔ Report written to: {report_path}")


  INGESTION_BATCH_ID = f"reg-{datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')}"
  "created_at": datetime.utcnow(),
  "updated_at": datetime.utcnow(),
  "timestamp_utc": datetime.utcnow().isoformat(),


Wrote 967 bytes.
✔ Registration completed: 4 files
✔ Report written to: /Volumes/census/raw/raw_files/registration_reports/registration_report_reg-20260106T090523Z.json
