In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Initialize Spark with a connection to the standalone cluster and HDFS.
#
# A Spark standalone master accepts application connections on its RPC port
# (7077 by default). Port 8080 is the default port of the master's web UI, so
# a `spark://...:8080` master URL leaves the driver unable to register.
# NOTE(review): adjust the port if this cluster overrides SPARK_MASTER_PORT.
SPARK_MASTER_URL = "spark://sparkmaster:7077"
HDFS_DEFAULT_FS = "hdfs://namenode:9000"

spark = (
    SparkSession.builder
    .appName("Amazon Reviews Topic Modeling")
    .master(SPARK_MASTER_URL)
    .config("spark.hadoop.fs.defaultFS", HDFS_DEFAULT_FS)
    .getOrCreate()
)

print("Connected to Spark!")

In [None]:
from pyspark.ml.clustering import DistributedLDAModel, LDAModel, LocalLDAModel
from pyspark.ml.feature import CountVectorizerModel
from pyspark.sql.types import ArrayType, StringType, FloatType

# Choose a category to analyze
category = "Electronics"


def load_lda_model(path):
    """Load a persisted LDA model from `path`.

    `LDAModel` is only the shared base class and cannot be loaded directly
    (it has no `load`); the readable classes are `LocalLDAModel` and
    `DistributedLDAModel`. Which one was saved depends on how the model was
    trained, which this notebook cannot see, so the local variant is tried
    first and the distributed variant second.

    Args:
        path: Location of the saved model directory.

    Returns:
        The loaded `LocalLDAModel` or `DistributedLDAModel`.

    Raises:
        Whatever `DistributedLDAModel.load` raises if neither class can read
        `path` (for example because the path does not exist).
    """
    try:
        return LocalLDAModel.load(path)
    except Exception:
        # Broad on purpose: the concrete exception type for a class mismatch
        # differs between PySpark versions. A genuine problem (missing path,
        # corrupt data) is raised again by the fallback load below.
        return DistributedLDAModel.load(path)


# Load the saved models
lda_model = load_lda_model(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/lda_model_{category}")
vectorizer_model = CountVectorizerModel.load(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/vectorizer_{category}")

# Load topic terms
topic_terms = spark.read.parquet(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/results/topics_{category}")

# Display topic terms to understand what each topic represents
topic_terms.select("topic", "terms").show(truncate=False)

# Load original reviews
# NOTE(review): the reviews live under /user/hadoop while the models and
# results above live under /user/jovyan -- confirm this is intentional.
reviews = spark.read.json(f"hdfs://namenode:9000/user/hadoop/amazon_reviews/data/{category}.filtered.json")

print(f"Loaded {reviews.count()} reviews for {category}")

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Combine the review summary and body into a single text column
reviews = reviews.withColumn(
    "full_text",
    F.concat_ws(" ", F.col("Review/Summary"), F.col("Review/Text"))
)

# Tokenize: the pattern describes the separators (runs of non-word characters)
tokenizer = RegexTokenizer(
    inputCol="full_text",
    outputCol="words",
    pattern="\\W+"
)
reviews_tokenized = tokenizer.transform(reviews)

# Remove stopwords
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words"
)
reviews_without_stopwords = remover.transform(reviews_tokenized)

# Filter short words: keep only tokens longer than two characters.
# This uses Spark's built-in `filter` higher-order function instead of a
# Python UDF, so rows are not serialized to a Python worker and a null array
# yields null rather than raising.
reviews_filtered = reviews_without_stopwords.withColumn(
    "filtered_words",
    F.expr("filter(filtered_words, w -> length(w) > 2)")
)

In [None]:
# Turn the token lists into count vectors with the pre-fitted vectorizer,
# then let the LDA model attach a topic distribution to every review
reviews_vectorized = vectorizer_model.transform(reviews_filtered)
reviews_scored = lda_model.transform(reviews_vectorized)


def get_primary_topic(distribution):
    """Return the position of the largest entry in `distribution`, as a float."""
    strongest_index = distribution.argmax()
    return float(strongest_index)


get_primary_topic_udf = F.udf(get_primary_topic, returnType=FloatType())

# Label each review with its dominant topic
reviews_with_topics = reviews_scored.withColumn(
    "primary_topic",
    get_primary_topic_udf(F.col("topicDistribution")),
)

# Spot-check a few rows to confirm that topics were assigned
preview_columns = ["ProductID", "Review/Score", "primary_topic"]
reviews_with_topics.select(*preview_columns).show(5)

In [None]:
# Per-topic rating statistics: mean score, number of reviews, and spread
rating_statistics = [
    F.avg("Review/Score").alias("avg_rating"),
    F.count("*").alias("review_count"),
    F.stddev("Review/Score").alias("rating_stddev"),
]
topic_ratings = reviews_with_topics.groupBy("primary_topic").agg(*rating_statistics)

# Attach each topic's terms so the numbers are easier to interpret
topic_labels = topic_terms.select("topic", "terms")
same_topic = topic_ratings.primary_topic == topic_terms.topic
topic_insights = topic_ratings.join(topic_labels, same_topic)

# Print the report, best-rated topics first
print(f"\nTOPIC RATINGS FOR {category}:")
report_columns = ["primary_topic", "terms", "avg_rating", "review_count"]
(
    topic_insights
    .select(*report_columns)
    .orderBy(F.desc("avg_rating"))
    .show(truncate=False)
)