In [0]:
import mlflow 

# Select the MLflow backends for this notebook: the "databricks-uc" URI targets
# the Unity Catalog model registry, and "databricks" sends runs to the
# workspace-hosted tracking server.
mlflow.set_registry_uri("databricks-uc")
mlflow.set_tracking_uri("databricks")


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Column layout applied when reading the source files, as (name, Spark type)
# pairs in file order. Every column is declared nullable.
_FIELD_TYPES = (
    ("marketplace", StringType),
    ("customer_id", StringType),
    ("review_id", StringType),
    ("product_id", StringType),
    ("product_parent", StringType),
    ("product_title", StringType),
    ("product_category", StringType),
    ("star_rating", IntegerType),
    ("helpful_votes", IntegerType),
    ("total_votes", IntegerType),
    ("vine", StringType),
    ("verified_purchase", StringType),
    ("review_headline", StringType),
    ("review_body", StringType),
    ("review_date", DateType),
)

schema = StructType(
    [StructField(field_name, field_type(), True)
     for field_name, field_type in _FIELD_TYPES]
)

In [0]:
import logging
from pyspark.sql import DataFrame
from pyspark.sql.functions import current_timestamp, lit

# Configure the root logger to emit INFO and above, prefixing each record with
# a timestamp and its level name.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers attached — confirm INFO records actually appear in this notebook
# environment.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s")

# Location the raw tab-separated source files are read from.
SOURCE_PATH = "/Volumes/workspace/sentiment_analysis/raw"
# Location the Bronze Parquet output is written to (and later re-read from).
BRONZE_PATH = "/Volumes/workspace/sentiment_analysis/bronze"


def read_source(spark) -> DataFrame:
    """Load the raw source files at ``SOURCE_PATH`` into a DataFrame.

    The files are parsed as tab-separated CSV with a header row, using the
    module-level ``schema`` instead of schema inference.

    Args:
        spark: Session object whose ``read`` attribute provides the reader.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    logging.info(f"Reading source data from {SOURCE_PATH}")
    reader = spark.read.options(header="true", sep="\t").schema(schema)
    return reader.csv(SOURCE_PATH)


def add_metadata(df: DataFrame) -> DataFrame:
    """Return ``df`` with two ingestion-metadata columns appended.

    Added columns:
        ingestion_timestamp: value of ``current_timestamp()``.
        source_file: the ``SOURCE_PATH`` constant as a literal. Every row gets
            the same value; it is not the path of the individual file the row
            was read from.

    Args:
        df: DataFrame to annotate.

    Returns:
        A new DataFrame containing the original columns plus the two above.
    """
    stamped = df.withColumn("ingestion_timestamp", current_timestamp())
    return stamped.withColumn("source_file", lit(SOURCE_PATH))


def write_bronze(df: DataFrame):
    """Persist ``df`` as Parquet at ``BRONZE_PATH`` and log the destination.

    The write uses ``overwrite`` mode, so whatever already exists at
    ``BRONZE_PATH`` is replaced.

    Args:
        df: DataFrame to write.
    """
    writer = df.write
    writer.parquet(BRONZE_PATH, mode="overwrite")
    logging.info(f"Bronze data written to {BRONZE_PATH}")


def main():
    """Run the Bronze ingestion inside a single MLflow run.

    Steps, in order: read the source, log its row count as ``rows_read``, add
    the metadata columns, preview five rows, write the Bronze output, then log
    the source/output paths as params and the column count as a metric.

    Relies on ``spark`` and ``mlflow`` being available in the notebook's
    global scope; neither is passed in.
    """
    with mlflow.start_run():
        raw_df = read_source(spark)
        mlflow.log_metric("rows_read", raw_df.count())

        bronze_df = add_metadata(raw_df)
        bronze_df.show(5)
        write_bronze(bronze_df)

        run_params = (("source", SOURCE_PATH), ("output", BRONZE_PATH))
        for param_name, param_value in run_params:
            mlflow.log_param(param_name, param_value)
        mlflow.log_metric("columns", len(bronze_df.columns))

# Execute the Bronze ingestion as soon as this cell runs.
main()

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC ## 🧱 Bronze Layer Validation & Logging
# MAGIC This notebook validates the ingested Bronze data and logs profiling metrics to MLflow.
# MAGIC
# MAGIC **Checks performed:**
# MAGIC 1. Schema consistency
# MAGIC 2. Row count and column count
# MAGIC 3. Nulls and duplicates
# MAGIC 4. Key stats (distinct products, date range)
# MAGIC 5. MLflow logging of results

# COMMAND ----------
import os
import mlflow
import pyspark.sql.functions as F

# === CONFIGURATION ===
# MLflow experiment that receives the validation run; it lives under a
# hard-coded user workspace folder.
EXPERIMENT_NAME = "bronze_validation"
EXPERIMENT_PATH = "/Users/nmoccagatta@itba.edu.ar/" + EXPERIMENT_NAME
mlflow.set_experiment(EXPERIMENT_PATH)

# COMMAND ----------
# === Load Bronze Data ===
# Announce the location, read the Parquet data back, then report its size.
print(f"Reading Bronze data from: {BRONZE_PATH}")
df = spark.read.parquet(BRONZE_PATH)

loaded_rows = df.count()
loaded_cols = len(df.columns)
print(f"Loaded {loaded_rows:,} rows, {loaded_cols} columns")

# COMMAND ----------
# === Schema Validation ===
print("Schema:")
df.printSchema()

# Columns the Bronze data is expected to contain: the fifteen source columns
# followed by the two metadata columns.
expected_columns = [
    'marketplace', 'customer_id', 'review_id',
    'product_id', 'product_parent', 'product_title', 'product_category',
    'star_rating', 'helpful_votes', 'total_votes',
    'vine', 'verified_purchase',
    'review_headline', 'review_body', 'review_date',
    'ingestion_timestamp', 'source_file',
]

# Report expected columns absent from the loaded data, in expected order.
# This check only prints; it does not stop the notebook.
actual_columns = set(df.columns)
missing_cols = [name for name in expected_columns
                if name not in actual_columns]
print(f"Missing columns: {missing_cols}" if missing_cols
      else "All expected columns present.")

# COMMAND ----------
# === Basic Metrics ===
def _distinct_count(column_name):
    """Return ``F.countDistinct(column_name)`` evaluated over ``df``."""
    return df.select(F.countDistinct(column_name)).first()[0]


# Table-level statistics; later cells add to this dict and log it to MLflow.
metrics = {
    "row_count": df.count(),
    "column_count": len(df.columns),
    "distinct_reviews": _distinct_count("review_id"),
    "distinct_products": _distinct_count("product_id"),
}

# The date range is only computed when the column is actually present.
if "review_date" in df.columns:
    date_summary = df.select(
        F.min("review_date").alias("min_date"),
        F.max("review_date").alias("max_date"),
    ).first()
    metrics.update(
        min_review_date=date_summary["min_date"],
        max_review_date=date_summary["max_date"],
    )

metrics

# COMMAND ----------
# === Null analysis ===
# One aggregate per column: count the rows where that column is NULL, keeping
# the column's own name as the output alias.
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
null_counts = df.select(null_count_exprs)

print("Null counts:")
null_counts.show(5, truncate=False)

# COMMAND ----------
# === Duplicate detection ===
# Counts how many distinct review_id values occur on more than one row
# (not the number of surplus rows).
if "review_id" in df.columns:
    rows_per_review = df.groupBy("review_id").count()
    dup_count = rows_per_review.filter("count > 1").count()
    metrics["duplicate_reviews"] = dup_count
    print(f"Duplicated review_id count: {dup_count}")

# COMMAND ----------
# === Log results to MLflow ===
with mlflow.start_run(run_name="bronze_validation_run"):
    # Read the date range with .get() rather than .pop(): popping removed the
    # entries from `metrics`, which hid them from the summary printed afterwards
    # and made a re-run of this cell log the string "None" for both dates.
    mlflow.log_param("min_review_date", str(metrics.get("min_review_date")))
    mlflow.log_param("max_review_date", str(metrics.get("max_review_date")))

    # Only numeric entries are logged as metrics; the date values stay in
    # `metrics` but are skipped by this type check.
    for k, v in metrics.items():
        if isinstance(v, (int, float)):
            mlflow.log_metric(k, v)

    mlflow.log_param("bronze_path", BRONZE_PATH)
    mlflow.log_param("columns", ",".join(df.columns))

    # Save schema as artifact: write the JSON to a local file, then upload it.
    schema_path = "../data/bronze/schema.json"
    os.makedirs(os.path.dirname(schema_path), exist_ok=True)
    with open(schema_path, "w", encoding="utf-8") as f:
        f.write(df.schema.json())
    mlflow.log_artifact(schema_path)

print("Metrics and schema logged to MLflow")

# COMMAND ----------
# === Summary output ===
# Print every entry currently held in `metrics`, one per line, with the name
# left-aligned in a 25-character column.
print("=== Summary ===")
for metric_name in metrics:
    metric_value = metrics[metric_name]
    print(f"{metric_name:25s}: {metric_value}")
