In [1]:
import mlflow
# Point MLflow at the file-based tracking store in ../mlruns (a relative URI).
mlflow.set_tracking_uri('file:../mlruns')
# Select the experiment that subsequent runs are logged under. As the last
# expression of the cell, the returned Experiment object is what gets displayed.
mlflow.set_experiment('amazon_book_reviews_local')

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location=('file:///Users/leandrohermann/Library/CloudStorage/OneDrive-Personal/ITBA/Big '
 'Data/tp/itba-bigdata/notebooks/../mlruns/414318220876030780'), creation_time=1762743563365, experiment_id='414318220876030780', last_update_time=1762743563365, lifecycle_stage='active', name='amazon_book_reviews_local', tags={}>

In [18]:
from pyspark.sql import SparkSession


def get_spark(app_name: str = "LocalSparkApp", memory: str = "4g") -> SparkSession:
    """Build (or fetch via ``getOrCreate``) a local SparkSession.

    Args:
        app_name: Application name passed to the session builder.
        memory: Value used for ``spark.driver.memory`` (e.g. ``"4g"``).

    Returns:
        The SparkSession, with its SparkContext log level set to ``WARN``.
    """
    # Applied in insertion order, matching the original chained .config() calls.
    settings = {
        "spark.sql.shuffle.partitions": "4",
        "spark.driver.memory": memory,
        "spark.sql.execution.arrow.pyspark.enabled": "true",
    }

    # "local[*]": run locally on all cores.
    builder = SparkSession.builder.appName(app_name).master("local[*]")
    for key, value in settings.items():
        builder = builder.config(key, value)

    session = builder.getOrCreate()
    session.sparkContext.setLogLevel("WARN")
    return session

In [22]:
import os
import logging
import mlflow
from pyspark.sql import DataFrame
from pyspark.sql.functions import current_timestamp, lit

# Root logger: INFO and above, prefixed with a timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Upstream location of the dataset; used for logging and lineage metadata.
SOURCE_PATH = "https://amazon-reviews-pds.s3.amazonaws.com/tsv/amazon_reviews_us_Furniture_v1_00.tsv"
# Directory that receives the bronze-layer Parquet output.
OUTPUT_PATH = "../data/bronze/amazon_reviews_furniture"


def read_source(
    spark,
    path: str = "../data/raw/amazon_reviews_us_Furniture_v1_00.tsv",
) -> DataFrame:
    """Read the raw tab-separated reviews file into a Spark DataFrame.

    The original implementation logged the remote ``SOURCE_PATH`` URL while
    actually reading a hard-coded local file. The path that is read and the
    path that is logged are now the same value.

    Args:
        spark: SparkSession used to read the file.
        path: Location of the TSV file. Defaults to the local raw copy that
            this function has always read.

    Returns:
        DataFrame with the first line used as the header and tab as the
        separator.
    """
    logging.info(f"Reading source data from {path}")
    return (
        spark.read
        .option("header", "true")
        .option("sep", "\t")
        .csv(path)
    )


def add_metadata(df: DataFrame) -> DataFrame:
    """Return ``df`` with two lineage columns appended.

    Adds ``ingestion_timestamp`` (from ``current_timestamp()``) and
    ``source_file`` (the literal ``SOURCE_PATH``), in that order.
    """
    stamped = df.withColumn("ingestion_timestamp", current_timestamp())
    return stamped.withColumn("source_file", lit(SOURCE_PATH))


def write_bronze(df: DataFrame):
    """Write ``df`` as Parquet to ``OUTPUT_PATH`` in overwrite mode.

    The output directory is created first if it does not exist, and a log
    line is emitted once the write call returns.
    """
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    writer = df.write.mode("overwrite")
    writer.parquet(OUTPUT_PATH)

    logging.info(f"✅ Bronze data written to {OUTPUT_PATH}")


def main():
    """Run the bronze ingestion inside an MLflow run.

    Steps: read the raw data, log the row count, append lineage columns,
    write the Parquet output, and log the resulting column count.

    The Spark session is stopped in a ``finally`` block so it is released
    even when reading, writing or MLflow logging raises. The run parameters
    are logged first so that a failed run still records its source and
    output paths.
    """
    spark = get_spark("BronzeIngestion")

    try:
        with mlflow.start_run(run_name="bronze_ingestion"):
            mlflow.log_param("source", SOURCE_PATH)
            mlflow.log_param("output", OUTPUT_PATH)

            reviews_raw = read_source(spark)
            mlflow.log_metric("rows_read", reviews_raw.count())

            reviews_bronze = add_metadata(reviews_raw)
            write_bronze(reviews_bronze)

            # Column count is taken after the lineage columns were added.
            mlflow.log_metric("columns", len(reviews_bronze.columns))
    finally:
        spark.stop()


# Run the ingestion only when this code executes with __name__ == "__main__".
if __name__ == "__main__":
    main()

2025-11-10 00:34:09,570 [INFO] Reading source data from https://amazon-reviews-pds.s3.amazonaws.com/tsv/amazon_reviews_us_Furniture_v1_00.tsv
2025-11-10 00:34:11,534 [INFO] ✅ Bronze data written to ../data/bronze/amazon_reviews_furniture
