In [0]:
# Location of the Silver input dataset (replace with your own path).
SILVER_PATH = "/Volumes/workspace/sentiment_analysis/silver"
# Location the Gold dataset is written to.
GOLD_OUT_PATH = "/Volumes/workspace/sentiment_analysis/gold"

# Columns the Silver dataset must contain; the notebook raises if any is absent.
REQUIRED_FIELDS = [
    "review_id",
    "product_id",
    "customer_id",
    "star_rating",
    "review_date",
    "review_body",
]

# Star ratings grouped by the sentiment label they are mapped to.
SENTIMENT_MAP = dict(
    negative=[1, 2],
    neutral=[3],
    positive=[4, 5],
)

In [0]:
# Read the Silver layer from parquet; every later cell builds on `df`.
df = spark.read.format("parquet").load(SILVER_PATH)

# Report the dataset size (the row count triggers a Spark job).
print(f"Loaded Silver dataset with {df.count()} rows and {len(df.columns)} columns.")

# Show the schema and a small sample for a quick visual sanity check.
df.printSchema()
df.show(n=5)

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Fail fast when the Silver schema lacks a column the pipeline relies on.
missing = [name for name in REQUIRED_FIELDS if name not in df.columns]
if missing:
    raise ValueError(f"Missing required fields: {missing}")

# Keep only rows with a usable review text and a rating in the 1-5 range:
#   * review_body must be present and longer than 5 characters once trimmed
#   * star_rating must be present, is cast to integer, and must lie in [1, 5]
df = (
    df
    .filter(F.col("review_body").isNotNull())
    .filter(F.length(F.trim(F.col("review_body"))) > 5)
    .filter(F.col("star_rating").isNotNull())
    .withColumn("star_rating", F.col("star_rating").cast(T.IntegerType()))
    .filter(F.col("star_rating").between(1, 5))
)

print(f"After filtering: {df.count()} rows remain.")

In [0]:
def basic_text_cleaning(df, text_col="review_body", out_col="clean_text"):
    """Normalise a text column and add simple length features.

    The cleaned text has HTML-like tags removed, typographic double quotes
    replaced by a plain '"', whitespace runs collapsed to a single space,
    leading/trailing spaces trimmed, and is lower-cased.

    Args:
        df: Spark DataFrame containing ``text_col``.
        text_col: Name of the column holding the raw text.
        out_col: Name of the column that receives the cleaned text.

    Returns:
        The DataFrame with three added columns: ``out_col``, ``n_chars``
        (length of the cleaned text) and ``n_words`` (number of
        space-separated tokens in the cleaned text, 0 for empty text).
    """
    expr = F.col(text_col)
    # Remove tags first and leave a space in their place, so words separated
    # only by markup ("good<br />bad") do not fuse into a single token.
    # Note: the pattern removes any span from '<' up to the next '>'.
    expr = F.regexp_replace(expr, r"<[^>]+>", " ")
    expr = F.regexp_replace(expr, r"[“”«»„‟]", '"')
    # Collapse whitespace only after tag removal (which can create new runs of
    # spaces) and trim the ends, so splitting on " " yields no empty tokens.
    expr = F.trim(F.regexp_replace(expr, r"\s+", " "))
    expr = F.lower(expr)

    df = df.withColumn(out_col, expr)
    df = df.withColumn("n_chars", F.length(F.col(out_col)))
    # Splitting an empty string still yields one (empty) token, so empty text
    # is reported as 0 words explicitly.
    df = df.withColumn(
        "n_words",
        F.when(F.col(out_col) == "", F.lit(0))
        .otherwise(F.size(F.split(F.col(out_col), " "))),
    )
    return df


df = basic_text_cleaning(df)

df.select("clean_text", "n_words").show(5, truncate=80)

In [0]:
def map_ratings_to_labels(df, rating_col="star_rating", out_col="sentiment_label"):
    """Add a sentiment label column derived from a star-rating column.

    Ratings are looked up in ``SENTIMENT_MAP`` ("negative", "neutral",
    "positive"). A rating that is null or not listed under any of those keys
    gets a null label instead of being silently counted as "neutral", so
    unexpected values stay visible in the label distribution.

    Args:
        df: Spark DataFrame containing ``rating_col``.
        rating_col: Name of the column holding the star rating.
        out_col: Name of the label column to add.

    Returns:
        The DataFrame with ``out_col`` added.
    """
    mapping = SENTIMENT_MAP
    rating = F.col(rating_col)
    expr = (
        F.when(rating.isin(mapping["negative"]), F.lit("negative"))
        .when(rating.isin(mapping["neutral"]), F.lit("neutral"))
        .when(rating.isin(mapping["positive"]), F.lit("positive"))
        # Unmapped or null ratings: no label rather than a fabricated one.
        .otherwise(F.lit(None).cast("string"))
    )
    return df.withColumn(out_col, expr)


df = map_ratings_to_labels(df)

# Quick check
df.groupBy("sentiment_label").count().show()

In [0]:
from pyspark.sql.window import Window

# Reviews of one product, newest first.
product_window = Window.partitionBy(
    "product_id").orderBy(F.col("review_date").desc())
# Reviews of one customer, oldest first. The dataset's customer key is
# "customer_id" (see REQUIRED_FIELDS); this spec is not used in this cell.
user_window = Window.partitionBy("customer_id").orderBy("review_date")

# Rank of each review within its product by recency (1 = newest; ties share a rank).
df = df.withColumn("review_rank", F.rank().over(product_window))

# "sentiment_label" is a string, so it cannot be averaged directly; encode it
# as a numeric score first (-1 negative, 0 neutral, +1 positive). Any other
# label value yields a null score, which avg() ignores.
sentiment_score = (
    F.when(F.col("sentiment_label") == "negative", F.lit(-1.0))
    .when(F.col("sentiment_label") == "neutral", F.lit(0.0))
    .when(F.col("sentiment_label") == "positive", F.lit(1.0))
)
# Rolling mean over the current row and the five rows preceding it in
# product_window order (i.e. up to five more recent reviews of the product).
df = df.withColumn("avg_sentiment_product", F.avg(
    sentiment_score).over(product_window.rowsBetween(-5, 0)))

In [0]:
# Columns wanted in the Gold dataset, in output order; any that the current
# frame does not have are skipped.
selected_cols = [
    name
    for name in (
        "review_id", "product_id", "customer_id",
        "clean_text", "sentiment_label", "star_rating",
        "review_date", "helpful_votes", "total_votes", "verified_purchase",
    )
    if name in df.columns
]

df_gold = df.select(selected_cols)

# Preview a few Gold rows.
df_gold.show(n=5, truncate=100)

In [0]:

df_gold.write.mode("overwrite").parquet(GOLD_OUT_PATH)

# Metadata
# Read the timestamp from a one-row helper frame rather than from df_gold:
# first() on an empty df_gold returns None (which would fail on indexing),
# and evaluating df_gold re-runs the whole pipeline just to get a timestamp.
created_at = spark.range(1).select(F.current_timestamp().alias(
    "created_at")).first()["created_at"]

# One aggregation provides both the per-label counts and the total row count.
label_distribution = {
    r["sentiment_label"]: r["count"]
    for r in df_gold.groupBy("sentiment_label").count().collect()
}

meta = {
    # convert to string if you plan to JSON dump
    "created_at": str(created_at),
    # Every row falls in exactly one label group, so the sum is the row count.
    "total_rows": sum(label_distribution.values()),
    "label_distribution": label_distribution,
}

print(f"Gold sentiment dataset saved to {GOLD_OUT_PATH}")

In [0]:
import mlflow
import json

mlflow.set_registry_uri("databricks-uc")
mlflow.set_tracking_uri("databricks")

with mlflow.start_run():
    # Label distribution; its counts also give the total number of rows.
    label_dist = df_gold.groupBy("sentiment_label").count().collect()
    total = sum(r["count"] for r in label_dist)

    mlflow.log_param("total_rows", total)
    mlflow.log_param("n_columns", len(df_gold.columns))
    mlflow.log_param("columns", ", ".join(df_gold.columns))

    for r in label_dist:
        label = r["sentiment_label"]
        # Guard against an empty dataset (total == 0).
        ratio = r["count"] / total if total else 0.0
        mlflow.log_metric(f"label_ratio_{label}", ratio)

    # Average review length; avg() is null when there are no rows, so the
    # metric is only logged when a value exists.
    avg_length = df_gold.select(F.avg(F.length("clean_text"))).first()[0]
    if avg_length is not None:
        mlflow.log_metric("avg_review_length", float(avg_length))

    # Missing data ratio (quick data health check).
    # All per-column null counts are computed in a single Spark job instead of
    # one filter().count() job per column. count() skips nulls, so counting a
    # value that is only non-null for null cells yields the null count.
    null_counts = df_gold.agg(*[
        F.count(F.when(F.col(col).isNull(), F.lit(1)))
        for col in df_gold.columns
    ]).first()
    missing_ratios = {
        col: (n_null / total if total else 0.0)
        for col, n_null in zip(df_gold.columns, null_counts)
    }
    for col, ratio in missing_ratios.items():
        mlflow.log_metric(f"missing_ratio_{col}", ratio)

    completeness_score = 1 - \
        sum(missing_ratios.values()) / len(df_gold.columns)

    # log_dict takes the dictionary first and the artifact file name second;
    # the ".json" extension makes MLflow serialise it as JSON.
    mlflow.log_dict(meta, "metadata.json")
    mlflow.log_metric("completeness_score", completeness_score)

    # Balance is 1 minus the largest deviation of a class share from the
    # uniform share; it is undefined (and skipped) when there are no rows.
    if total:
        balance_score = 1 - \
            max(abs(r["count"] - total / len(label_dist)) /
                total for r in label_dist)
        mlflow.log_metric("balance_score", balance_score)