In [0]:
import mlflow 

# Configure the MLflow model-registry and tracking URIs up front so that the
# runs started later in this notebook use them.
mlflow.set_registry_uri(uri="databricks-uc")
mlflow.set_tracking_uri(uri="databricks")


In [0]:
# SILVER DATA PROCESSING
# Step: Data Cleaning & Structuring

import pyspark.sql.functions as F
from pyspark.sql.types import *
import datetime

# Input (Bronze) and output (Silver) locations for this stage.
BRONZE_PATH = "/Volumes/workspace/sentiment_analysis/bronze"
SILVER_PATH = "/Volumes/workspace/sentiment_analysis/silver"

# Read the Bronze Parquet data and record its row count immediately; the count
# is the baseline for the retention metrics computed further down.
bronze_df = spark.read.format("parquet").load(BRONZE_PATH)
bronze_count = bronze_df.count()

# Type conversions
# Each entry replaces the column of the same name with a typed version.
# For the two flag columns, only the literal "Y" maps to True; every other
# value (including null) maps to False.
typed_columns = {
    "star_rating": F.col("star_rating").cast("float"),
    "helpful_votes": F.col("helpful_votes").cast("int"),
    "total_votes": F.col("total_votes").cast("int"),
    "vine": F.when(F.col("vine") == "Y", F.lit(True)).otherwise(F.lit(False)),
    "verified_purchase": F.when(
        F.col("verified_purchase") == "Y", F.lit(True)
    ).otherwise(F.lit(False)),
    "review_date": F.to_date("review_date", "yyyy-MM-dd"),
    "ingestion_timestamp": F.to_timestamp("ingestion_timestamp"),
}

silver_df = bronze_df
for column_name, column_expr in typed_columns.items():
    silver_df = silver_df.withColumn(column_name, column_expr)

# Standardize column names: lowercase every column in one positional rename.
silver_df = silver_df.toDF(*(name.lower() for name in silver_df.columns))

# Row-level cleaning, applied in this order:
#   1. drop rows with a null in any essential field,
#   2. keep only star ratings inside the inclusive range [1, 5.0],
#   3. keep a single row per review_id.
required_fields = [
    "review_id",
    "product_id",
    "customer_id",
    "star_rating",
    "review_date",
    "review_body",
]

silver_df = (
    silver_df
    .na.drop(subset=required_fields)
    .where(F.col("star_rating").between(1, 5.0))
    .drop_duplicates(["review_id"])
)

# Log metrics
# An empty Bronze input would make the retention ratio below divide by zero.
# Fail with an explicit message before running the Silver count, logging
# anything to MLflow, or overwriting SILVER_PATH.
if bronze_count == 0:
    raise ValueError(
        f"Bronze dataset at {BRONZE_PATH} is empty; "
        "cannot compute the retention ratio or build the Silver dataset.")

silver_count = silver_df.count()
# Rows removed by the cleaning steps above (null drops, rating-range filter,
# and de-duplication on review_id).
invalid_rows = bronze_count - silver_count
# Fraction of Bronze rows that survived cleaning, rounded to 4 decimal places.
retention_ratio = round(silver_count / bronze_count, 4)

# The write happens inside the run context together with the logged values.
with mlflow.start_run():

    mlflow.log_param("bronze_rows", bronze_count)
    mlflow.log_param("silver_rows", silver_count)
    mlflow.log_metric("invalid_rows", invalid_rows)
    mlflow.log_metric("retention_ratio", retention_ratio)
    mlflow.log_param(
        "process_date", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Save Silver Data (replaces whatever is currently stored at SILVER_PATH)
    silver_df.write.mode("overwrite").parquet(SILVER_PATH)


    # Sanity Check
    print(f"Bronze count: {bronze_count}")
    print(f"Silver count: {silver_count}")
    print(f"Retention ratio: {retention_ratio}")
