# In [0]:
from pyspark.sql.functions import *

# Load the raw reviews. Only the "header" option is set, so no schema is
# supplied or inferred and every column (including "rating") arrives as a
# string.
df = spark.read.option("header", True).csv("dbfs:/FileStore/reviews.csv")

# Clean text: drop every character that is not an ASCII letter, digit or
# space, then lower-case what is left.
df = df.withColumn(
    "clean_review",
    lower(regexp_replace(col("review_text"), "[^a-zA-Z0-9 ]", "")),
)

# Count words. Stripping punctuation can leave leading, trailing or repeated
# spaces (e.g. "good - bad" -> "good  bad"), and splitting on a single space
# would count the resulting empty strings as words. Split on runs of spaces
# and discard empty tokens so that only real words are counted; an empty
# review therefore has length 0 rather than 1.
df = df.withColumn(
    "review_length",
    size(array_remove(split(col("clean_review"), " +"), lit(""))),
)

# Numeric view of the string "rating" column. The cast is applied only inside
# the expressions below so the column types of the written table are
# unchanged.
rating_num = col("rating").cast("double")

# Keep only reviews rated 3 or higher that contain at least 3 words; rows
# with a missing/unparseable rating or a missing review are dropped because
# the comparison evaluates to null.
df_clean = df.filter((rating_num >= 3) & (col("review_length") >= 3))

# Aggregations, both ordered most-reviewed first.
# NOTE: these DataFrames are lazy and nothing in this cell triggers them;
# call an action (display/show/write) on them to materialise the results.
top_products = (
    df_clean.groupBy("product_id")
    .agg(
        count("*").alias("review_count"),
        avg(rating_num).alias("avg_rating"),
    )
    .orderBy(desc("review_count"))
)
top_users = df_clean.groupBy("user_id").count().orderBy(desc("count"))

# Save the cleaned reviews as a Delta table, replacing any previous output.
df_clean.write.format("delta").mode("overwrite").save("dbfs:/mnt/data/clean_reviews")
