In [1]:
import mlflow
mlflow.set_tracking_uri('file:../mlruns')
mlflow.set_experiment('amazon_books_reviews_local')

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location=('file:///Users/leandrohermann/Library/CloudStorage/OneDrive-Personal/ITBA/Big '
 'Data/tp/itba-bigdata/notebooks/../mlruns/840435828919065229'), creation_time=1762810709216, experiment_id='840435828919065229', last_update_time=1762810709216, lifecycle_stage='active', name='amazon_books_reviews_local', tags={}>

In [5]:
from pyspark.sql import SparkSession


def get_spark(app_name: str = "LocalSparkApp", memory: str = "4g") -> SparkSession:
    spark = (
        SparkSession.builder
        .appName(app_name)
        .master("local[*]")  # run locally on all cores
        .config("spark.sql.shuffle.partitions", "4")
        .config("spark.driver.memory", memory)
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("WARN")
    return spark

In [6]:
import os
import logging
import mlflow
from pyspark.sql import DataFrame
from pyspark.sql.functions import current_timestamp, lit

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s")

# SOURCE_PATH = "https://amazon-reviews-pds.s3.amazonaws.com/tsv/amazon_reviews_us_Furniture_v1_00.tsv"
SOURCE_PATH = "../data/raw/amazon_reviews_us_Furniture_v1_00.tsv"
OUTPUT_PATH = "../data/bronze/amazon_reviews_furniture"


def read_source(spark) -> DataFrame:
    logging.info(f"Reading source data from {SOURCE_PATH}")
    df = (
        spark.read
        .option("header", "true")
        .option("sep", "\t")
        .csv(SOURCE_PATH)
    )
    return df


def add_metadata(df: DataFrame) -> DataFrame:
    return df.withColumn("ingestion_timestamp", current_timestamp()) \
             .withColumn("source_file", lit(SOURCE_PATH))


def write_bronze(df: DataFrame):
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    df.write.mode("overwrite").parquet(OUTPUT_PATH)
    logging.info(f"Bronze data written to {OUTPUT_PATH}")


def main():
    spark = get_spark("BronzeIngestion")

    with mlflow.start_run(run_name="bronze_ingestion"):
        df = read_source(spark)
        row_count = df.count()
        mlflow.log_metric("rows_read", row_count)

        df = add_metadata(df)
        write_bronze(df)

        mlflow.log_param("source", SOURCE_PATH)
        mlflow.log_param("output", OUTPUT_PATH)
        mlflow.log_metric("columns", len(df.columns))

    spark.stop()


if __name__ == "__main__":
    main()

2025-11-12 07:52:49,386 [INFO] Reading source data from ../data/raw/amazon_reviews_us_Furniture_v1_00.tsv
2025-11-12 07:52:51,218 [INFO] Bronze data written to ../data/bronze/amazon_reviews_furniture


In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC ## ðŸ§± Bronze Layer Validation & Logging
# MAGIC This notebook validates the ingested Bronze data and logs profiling metrics to MLflow.
# MAGIC
# MAGIC **Checks performed:**
# MAGIC 1. Schema consistency
# MAGIC 2. Row count and column count
# MAGIC 3. Nulls and duplicates
# MAGIC 4. Key stats (distinct products, date range)
# MAGIC 5. MLflow logging of results

# COMMAND ----------
import os
import mlflow
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# === CONFIGURATION ===
BRONZE_PATH = "../data/bronze/amazon_reviews_furniture/part-00000-*.parquet"
EXPERIMENT_NAME = "bronze_validation"
MLFLOW_URI = "file:../mlruns"

# Initialize Spark session
spark = get_spark("BronzeValidation")
spark.sparkContext.setLogLevel("WARN")

# Initialize MLflow
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# COMMAND ----------
# === Load Bronze Data ===
print(f"Reading Bronze data from: {BRONZE_PATH}")
df = spark.read.parquet(BRONZE_PATH)
print(f"Loaded {df.count():,} rows, {len(df.columns)} columns")

# COMMAND ----------
# === Schema Validation ===
print("Schema:")
df.printSchema()

# Expected key columns (adapt as needed)
expected_columns = ["marketplace", "customer_id",
                    "review_id", "product_id", "product_title", "star_rating"]

missing_cols = [c for c in expected_columns if c not in df.columns]
if missing_cols:
    print(f"Missing columns: {missing_cols}")
else:
    print("All expected columns present.")

# COMMAND ----------
# === Basic Metrics ===
metrics = {}

metrics["row_count"] = df.count()
metrics["column_count"] = len(df.columns)
metrics["distinct_reviews"] = df.select(
    F.countDistinct("review_id")).first()[0]
metrics["distinct_products"] = df.select(
    F.countDistinct("product_id")).first()[0]

if "review_date" in df.columns:
    date_summary = df.select(
        F.min("review_date").alias("min_date"),
        F.max("review_date").alias("max_date")
    ).first()
    metrics["min_review_date"] = date_summary["min_date"]
    metrics["max_review_date"] = date_summary["max_date"]

metrics

# COMMAND ----------
# === Null analysis ===
null_counts = (
    df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c)
              for c in df.columns])
)
print("Null counts:")
null_counts.show(5, truncate=False)

# COMMAND ----------
# === Duplicate detection ===
if "review_id" in df.columns:
    dup_count = df.groupBy("review_id").count().filter("count > 1").count()
    metrics["duplicate_reviews"] = dup_count
    print(f"Duplicated review_id count: {dup_count}")

# COMMAND ----------
# === Log results to MLflow ===
with mlflow.start_run(run_name="bronze_validation_run"):
    mlflow.log_param("min_review_date", str(
        metrics.pop("min_review_date", None)))
    mlflow.log_param("max_review_date", str(
        metrics.pop("max_review_date", None)))
    for k, v in metrics.items():
        if isinstance(v, (int, float)):
            mlflow.log_metric(k, v)
    mlflow.log_param("bronze_path", BRONZE_PATH)
    mlflow.log_param("columns", ",".join(df.columns))
    # Save schema as artifact
    schema_path = "../data/bronze/schema.json"
    os.makedirs(os.path.dirname(schema_path), exist_ok=True)
    with open(schema_path, "w") as f:
        f.write(df.schema.json())
    mlflow.log_artifact(schema_path)

print("Metrics and schema logged to MLflow")

# COMMAND ----------
# === Summary output ===
print("=== Summary ===")
for k, v in metrics.items():
    print(f"{k:25s}: {v}")

print("\nOpen MLflow UI to review: http://127.0.0.1:5000")
spark.stop()

ðŸ”¹ Reading Bronze data from: ../data/bronze/amazon_reviews_furniture/part-00000-*.parquet
Loaded 87,373 rows, 17 columns
Schema:
root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)

All expected columns present.
ðŸ”¹ Null counts:
+-----------+-----------+---------+----------+--------------+