In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Initialize Spark with connection to your cluster and HDFS
spark = SparkSession.builder \
    .appName("Amazon Reviews Topic Modeling") \
    .master("spark://sparkmaster:8080") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()

print("Connected to Spark!")

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NullPointerException: Cannot invoke "org.apache.spark.scheduler.TaskScheduler.postStartHook()" because the return value of "org.apache.spark.SparkContext._taskScheduler()" is null
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:694)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


In [2]:
from pyspark.ml.clustering import LocalLDAModel
from pyspark.ml.feature import CountVectorizerModel
from pyspark.sql.types import ArrayType, StringType, FloatType

# Choose a category to analyze
category = "Arts"

# Load the saved models
lda_model = LocalLDAModel.load(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/lda_model_{category}")
vectorizer_model = CountVectorizerModel.load(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/vectorizer_{category}")

# Load topic terms
topic_terms = spark.read.parquet(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/results/topics_{category}")

# Display topic terms to understand what each topic represents
topic_terms.select("topic", "terms").show(truncate=False)

# Load original reviews
reviews = spark.read.json(f"hdfs://namenode:9000/user/hadoop/amazon_reviews/data/{category}.filtered.json")

print(f"Loaded {reviews.count()} reviews for {category}")

IllegalArgumentException: requirement failed: Can only call getServletHandlers on a running MetricsSystem

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Combine review text
reviews = reviews.withColumn(
    "full_text", 
    F.concat_ws(" ", F.col("Review/Summary"), F.col("Review/Text"))
)

# Tokenize
tokenizer = RegexTokenizer(
    inputCol="full_text", 
    outputCol="words", 
    pattern="\\W+"
)
reviews_tokenized = tokenizer.transform(reviews)

# Remove stopwords
remover = StopWordsRemover(
    inputCol="words", 
    outputCol="filtered_words"
)
reviews_filtered = remover.transform(reviews_tokenized)

# Filter short words
filter_short_udf = F.udf(
    lambda words: [word for word in words if len(word) > 2],
    ArrayType(StringType())
)
reviews_filtered = reviews_filtered.withColumn(
    "filtered_words",
    filter_short_udf(F.col("filtered_words"))
)

In [None]:
# Apply the vectorizer model
reviews_vectorized = vectorizer_model.transform(reviews_filtered)

# Apply the LDA model to get topic distributions
reviews_with_topics = lda_model.transform(reviews_vectorized)

# Get the primary topic for each review
def get_primary_topic(distribution):
    return float(distribution.argmax())

get_primary_topic_udf = F.udf(get_primary_topic, FloatType())

reviews_with_topics = reviews_with_topics.withColumn(
    "primary_topic", 
    get_primary_topic_udf(F.col("topicDistribution"))
)

# Verify that topics are assigned
reviews_with_topics.select("ProductID", "Review/Score", "primary_topic").show(5)

In [None]:
# Calculate average rating by topic
topic_ratings = reviews_with_topics.groupBy("primary_topic").agg(
    F.avg("Review/Score").alias("avg_rating"),
    F.count("*").alias("review_count"),
    F.stddev("Review/Score").alias("rating_stddev")
)

# Join with topic terms for better readability
topic_insights = topic_ratings.join(
    topic_terms.select("topic", "terms"), 
    topic_ratings.primary_topic == topic_terms.topic
)

# Show results sorted by rating
print(f"\nTOPIC RATINGS FOR {category}:")
topic_insights.select(
    "primary_topic", 
    "terms", 
    "avg_rating", 
    "review_count"
).orderBy(F.desc("avg_rating")).show(truncate=False)