In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Initialize Spark with a connection to the standalone cluster and HDFS.
# A spark:// master URL must target the master's RPC port (7077 by default);
# 8080 is the default port of the master's web UI and does not accept driver
# connections.
# NOTE(review): adjust the port if this cluster's master was started with a
# non-default RPC port.
spark = (
    SparkSession.builder
    .appName("Amazon Reviews Topic Modeling")
    .master("spark://sparkmaster:7077")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)

print("Connected to Spark!")

In [None]:
from pyspark.ml.clustering import LDAModel
from pyspark.ml.feature import CountVectorizerModel
from pyspark.sql.types import ArrayType, StringType, FloatType

# Category whose saved models, topic terms and reviews are loaded below
category = "Electronics"

# Restore the previously saved LDA model and count vectorizer from HDFS
lda_model_path = f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/lda_model_{category}"
vectorizer_path = f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/vectorizer_{category}"
lda_model = LDAModel.load(lda_model_path)
vectorizer_model = CountVectorizerModel.load(vectorizer_path)

# Read the stored topic terms and print them untruncated so each topic
# can be interpreted
topics_path = f"hdfs://namenode:9000/user/jovyan/amazon_reviews/results/topics_{category}"
topic_terms = spark.read.parquet(topics_path)
topic_terms.select("topic", "terms").show(truncate=False)

# Read the filtered review JSON for the chosen category
reviews_path = f"hdfs://namenode:9000/user/hadoop/amazon_reviews/data/{category}.filtered.json"
reviews = spark.read.json(reviews_path)

# Report how many reviews were read (count() runs a Spark job)
print(f"Loaded {reviews.count()} reviews for {category}")

In [None]:
from pyspark.ml.clustering import LDAModel
from pyspark.ml.feature import CountVectorizerModel

# Topic inference needs `vectorized_reviews`: the reviews after the same
# preprocessing steps used before (tokenization, stopwords, etc.) up to the
# vectorized form. No visible cell in this notebook builds that DataFrame, so
# check for it up front and fail with an actionable message instead of a bare
# NameError raised only after the model has been loaded from HDFS.
if "vectorized_reviews" not in globals():
    raise NameError(
        "vectorized_reviews is not defined: recreate the preprocessing "
        "pipeline (tokenization, stopwords, vectorization) and assign its "
        "output to `vectorized_reviews` before running this cell."
    )

# 2. Load saved models
lda_model = LDAModel.load(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/models/lda_model_{category}")

# 3. Load topic terms
topic_terms = spark.read.parquet(f"hdfs://namenode:9000/user/jovyan/amazon_reviews/results/topics_{category}")

# Apply model to get topic distributions
reviews_with_topics = lda_model.transform(vectorized_reviews)

# Add primary topic (the topic with highest probability)
def get_primary_topic(distribution):
    """Return the index of the highest-probability topic as a float.

    Args:
        distribution: Object exposing ``argmax()`` (the UDF built from this
            function passes the ``topicDistribution`` column value), or
            ``None``.

    Returns:
        ``float(distribution.argmax())``, or ``None`` when ``distribution``
        is ``None`` so that a missing distribution produces a null value
        instead of failing the whole UDF.
    """
    if distribution is None:
        return None
    # Kept as a float: the UDF wrapping this function declares FloatType().
    return float(distribution.argmax())

# Wrap the helper as a Spark UDF whose declared return type is FloatType
get_primary_topic_udf = F.udf(get_primary_topic, FloatType())

# Build the primary-topic expression from the topicDistribution column and
# attach it to the DataFrame as "primary_topic"
primary_topic_column = get_primary_topic_udf(F.col("topicDistribution"))
reviews_with_topics = reviews_with_topics.withColumn("primary_topic", primary_topic_column)

In [None]:
# Calculate rating statistics by topic.
# The column is referenced through `F.col`: only the `F` alias of
# pyspark.sql.functions is imported in this notebook, so a bare `col(...)`
# raises NameError.
topic_ratings = (
    reviews_with_topics
    .filter(F.col("primary_topic").isNotNull())
    .groupBy("primary_topic")
    .agg(
        F.avg("Review/Score").alias("avg_rating"),
        F.count("*").alias("review_count"),
        F.stddev("Review/Score").alias("rating_stddev"),
    )
)

# Join with topic terms so every topic's statistics carry its terms
topic_insights = topic_ratings.join(
    topic_terms,
    topic_ratings.primary_topic == topic_terms.topic
)

# Display results, lowest average rating first
topic_insights.select(
    "primary_topic",
    "terms",
    "avg_rating",
    "review_count"
).orderBy("avg_rating").show(truncate=False)