In [1]:
from pyspark.sql import SparkSession


def get_spark(app_name: str = "LocalSparkApp", memory: str = "4g") -> SparkSession:
    """Build (or reuse) a SparkSession that runs in local mode.

    Args:
        app_name: Application name handed to the session builder.
        memory: Value assigned to ``spark.driver.memory`` (e.g. ``"4g"``).

    Returns:
        The session returned by ``getOrCreate()``, with the log level of its
        SparkContext set to ``WARN``.
    """
    settings = {
        "spark.sql.shuffle.partitions": "4",
        "spark.driver.memory": memory,
        "spark.sql.execution.arrow.pyspark.enabled": "true",
    }

    # "local[*]" -> run locally on all cores
    builder = SparkSession.builder.appName(app_name).master("local[*]")
    for option, value in settings.items():
        builder = builder.config(option, value)

    session = builder.getOrCreate()
    session.sparkContext.setLogLevel("WARN")
    return session

In [2]:
# SILVER DATA PROCESSING
# Step: Data Cleaning & Structuring

import os
import pyspark.sql.functions as F
from pyspark.sql.types import *
import mlflow
import datetime

# Locations are relative to the notebook's working directory.
BRONZE_PATH = "../data/bronze/amazon_reviews_furniture/"
SILVER_PATH = "../data/silver/amazon_reviews_furniture/"
LOG_PATH = "../data/logs/amazon_reviews_furniture/"
MLFLOW_URI = "file:../mlruns"
EXPERIMENT_NAME = "silver_processing"

spark = get_spark("SilverProcessing")

# Everything after session creation runs under try/finally so the session is
# always stopped, even when a step raises. Without this, a failed run leaves
# the local session (and its UI port) alive in the kernel.
try:
    mlflow.set_tracking_uri(MLFLOW_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)

    # Load Bronze data
    bronze_df = spark.read.parquet(BRONZE_PATH)
    bronze_count = bronze_df.count()

    # Fail early and explicitly on an empty input: the retention ratio below
    # divides by bronze_count, and an empty Bronze table should never reach the
    # overwrite of the Silver output.
    if bronze_count == 0:
        raise ValueError(
            f"Bronze dataset at {BRONZE_PATH!r} contains no rows; "
            "aborting Silver processing.")

    # Type conversions
    silver_df = (
        bronze_df
        .withColumn("star_rating", F.col("star_rating").cast(FloatType()))
        .withColumn("helpful_votes", F.col("helpful_votes").cast(IntegerType()))
        .withColumn("total_votes", F.col("total_votes").cast(IntegerType()))
        # Flags become booleans: "Y" -> True, anything else (including null) -> False.
        .withColumn("vine", F.when(F.col("vine") == "Y", F.lit(True)).otherwise(F.lit(False)))
        .withColumn("verified_purchase", F.when(F.col("verified_purchase") == "Y", F.lit(True)).otherwise(F.lit(False)))
        .withColumn("review_date", F.to_date("review_date", "yyyy-MM-dd"))
        .withColumn("ingestion_timestamp", F.to_timestamp("ingestion_timestamp"))
    )

    # Standardize column names: lower-case every column in a single positional
    # rename instead of stacking one projection per column.
    silver_df = silver_df.toDF(*[c.lower() for c in silver_df.columns])

    # Drop nulls in essential fields.
    # NOTE(review): values that failed the casts/parsing above are presumably
    # null at this point and are removed here as well - confirm ANSI mode is off.
    required_fields = ["review_id", "product_id",
                       "customer_id", "star_rating", "review_date", "review_body"]
    silver_df = silver_df.dropna(subset=required_fields)

    # Keep ratings within valid range
    silver_df = silver_df.filter(
        (F.col("star_rating") >= 1.0) & (F.col("star_rating") <= 5.0))

    # Deduplicate: keep one row per review_id
    silver_df = silver_df.dropDuplicates(["review_id"])

    # Compute metrics (bronze_count is guaranteed non-zero by the guard above)
    silver_count = silver_df.count()
    invalid_rows = bronze_count - silver_count
    retention_ratio = round(silver_count / bronze_count, 4)

    with mlflow.start_run(run_name=EXPERIMENT_NAME):

        mlflow.log_param("bronze_rows", bronze_count)
        mlflow.log_param("silver_rows", silver_count)
        mlflow.log_metric("invalid_rows", invalid_rows)
        mlflow.log_metric("retention_ratio", retention_ratio)
        mlflow.log_param(
            "process_date", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        # Save Silver Data (replaces any previous Silver output)
        os.makedirs(os.path.dirname(SILVER_PATH), exist_ok=True)
        silver_df.write.mode("overwrite").parquet(SILVER_PATH)

        # LOG_PATH is a directory; this cell creates it if missing but writes no
        # files into it, so the logged artifact is whatever already exists there.
        os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

        mlflow.log_artifact(LOG_PATH)

        # Sanity Check
        print(f"Bronze count: {bronze_count}")
        print(f"Silver count: {silver_count}")
        print(f"Retention ratio: {retention_ratio}")
finally:
    spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/13 23:22:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/13 23:22:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/11/13 23:22:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
  return FileStore(store_uri, store_uri)
25/11/13 23:22:08 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Bronze count: 792113
Silver count: 791971
Retention ratio: 0.9998
