In [1]:
from pyspark.sql import SparkSession

# Initialize Spark with HDFS configuration
spark = SparkSession.builder \
    .appName("Amazon Reviews Analysis") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()
# Now you can read data from HDFS
# Replace the path with your actual path where JSON files are stored
df = spark.read.json("hdfs://namenode:9000/user/hadoop/amazon_reviews/data/filtered_data/Arts.filtered.json")

# Verify that data was loaded correctly
print(f"Number of records: {df.count()}")
print("Schema:")
#df.printSchema()

# Show a few sample records
df.show(5, truncate=False)

Number of records: 18313
Schema:
+-------------+-----------------+----------------------------------------------+------------------+-------------------------------+------------+-----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------+
|product/price|product/productId|product/title                                 |review/helpful

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Initialize Spark with connection to your cluster and HDFS
spark = SparkSession.builder \
    .appName("Amazon Reviews Topic Modeling") \
    .master("spark://sparkmaster:8080") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()

print("SparkSession created successfully!")

SparkSession created successfully!


In [3]:
# Adjust path to match your actual location in HDFS
base_path = "hdfs://namenode:9000/user/hadoop/amazon_reviews/data/filtered_data/"

# Get a count of reviews per category to understand data distribution
category_counts = []

# List all categories you have
categories = [
    "Amazon_Instant_Video",
    "Arts",
    "Automotive",
    "Baby",
    "Beauty",
    "Books",
    "Cell_Phones_&_Accessories",
    "Clothing_&_Accessories",
    "Electronics",
    "Gourmet_Foods",
    "Health",
    "Home_&_Kitchen",
    "Industrial_&_Scientific",
    "Jewelry",
    "Kindle_Store",
    "Movies_&_TV",
    "Music",
    "Musical_Instruments",
    "Office_Products",
    "Patio",
    "Pet_Supplies",
    "Shoes",
    "Software",
    "Sports_&_Outdoors",
    "Tools_&_Home_Improvement",
    "Toys_&_Games",
    "Video_Games",
    "Watches"
]

# Count records per category
for category in categories:
    try:
        df = spark.read.json(f"{base_path}{category}.filtered.json")
        count = df.count()
        category_counts.append((category, count))
        print(f"Category: {category}, Records: {count}")
    except:
        print(f"Error loading category: {category}")

# Check total number of reviews
print(f"Total categories loaded: {len(category_counts)}")
print(f"Total reviews: {sum([count for _, count in category_counts])}")

Error loading category: Amazon_Instant_Video
Error loading category: Arts
Error loading category: Automotive
Error loading category: Baby
Error loading category: Beauty
Error loading category: Books
Error loading category: Cell_Phones_&_Accessories
Error loading category: Clothing_&_Accessories
Error loading category: Electronics
Error loading category: Gourmet_Foods
Error loading category: Health
Error loading category: Home_&_Kitchen
Error loading category: Industrial_&_Scientific
Error loading category: Jewelry
Error loading category: Kindle_Store
Error loading category: Movies_&_TV
Error loading category: Music
Error loading category: Musical_Instruments
Error loading category: Office_Products
Error loading category: Patio
Error loading category: Pet_Supplies
Error loading category: Shoes
Error loading category: Software
Error loading category: Sports_&_Outdoors
Error loading category: Tools_&_Home_Improvement
Error loading category: Toys_&_Games
Error loading category: Video_Games

In [11]:
# Function to load all categories
def load_categories(categories, base_path):
    # Start with the first category
    print(f"Loading {categories[0]}...")
    all_df = spark.read.json(f"{base_path}{categories[0]}.filtered.json")
    all_df = all_df.withColumn("category", F.lit(categories[0]))
    
    # Add all other categories with a union
    for category in categories[1:]:
        try:
            print(f"Loading {category}...")
            df = spark.read.json(f"{base_path}{category}.filtered.json")
            df = df.withColumn("category", F.lit(category))
            all_df = all_df.union(df)
        except Exception as e:
            print(f"Error with {category}: {str(e)}")
    
    return all_df

# Load data from all categories (this might take a while)
all_reviews = load_categories(categories, base_path)
print(f"Total reviews loaded: {all_reviews.count()}")

Loading Amazon_Instant_Video...
Loading Arts...
Loading Automotive...
Loading Baby...
Loading Beauty...
Loading Books...
Loading Cell_Phones_&_Accessories...
Loading Clothing_&_Accessories...
Loading Electronics...
Loading Gourmet_Foods...
Loading Health...
Loading Home_&_Kitchen...
Loading Industrial_&_Scientific...
Loading Jewelry...
Loading Kindle_Store...
Loading Movies_&_TV...
Loading Music...
Loading Musical_Instruments...
Loading Office_Products...
Loading Patio...
Loading Pet_Supplies...
Loading Shoes...
Loading Software...
Loading Sports_&_Outdoors...
Loading Tools_&_Home_Improvement...
Loading Toys_&_Games...
Loading Video_Games...
Loading Watches...
Total reviews loaded: 5315161


In [12]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql.types import ArrayType, StringType

# First, combine the review summary and text
all_reviews = all_reviews.withColumn(
    "full_text", 
    F.concat_ws(" ", 
                F.col("Review/Summary"), 
                F.col("Review/Text"))
)

# Step 1: Tokenize the text (split into words)
tokenizer = RegexTokenizer(
    inputCol="full_text", 
    outputCol="words", 
    pattern="\\W+"  # Split on non-word characters
)
reviews_tokenized = tokenizer.transform(all_reviews)

# Step 2: Remove stopwords (common words like "the", "and", etc.)
remover = StopWordsRemover(
    inputCol="words", 
    outputCol="filtered_words"
)
reviews_no_stopwords = remover.transform(reviews_tokenized)

# Step 3: Filter out very short words
filter_short_udf = F.udf(
    lambda words: [word for word in words if len(word) > 2],
    ArrayType(StringType())
)
processed_reviews = reviews_no_stopwords.withColumn(
    "filtered_words",
    filter_short_udf(F.col("filtered_words"))
)

# Check our processing results
processed_reviews.select("category", "full_text", "filtered_words").show(2, truncate=False)

+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category            |full_text                                                                                                                                                                                                                 |filtered_words                                                                                                                                                                      |
+--------------------+------------------------------------------------------------------------------------------------------------------------------------

In [13]:
from pyspark.ml.feature import CountVectorizer

# Create a CountVectorizer to convert words to vectors
# This counts how many times each word appears
vectorizer = CountVectorizer(
    inputCol="filtered_words", 
    outputCol="features",
    vocabSize=10000,  # Keep the top 10,000 words
    minDF=5           # Word must appear in at least 5 documents
)

# Fit the vectorizer on our data
vectorizer_model = vectorizer.fit(processed_reviews)
vectorized_reviews = vectorizer_model.transform(processed_reviews)

# See what we get
vectorized_reviews.select("category", "filtered_words", "features").show(2)

# Get the vocabulary for later use
vocabulary = vectorizer_model.vocabulary
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Some example words: {vocabulary[:10]}")

+--------------------+--------------------+--------------------+
|            category|      filtered_words|            features|
+--------------------+--------------------+--------------------+
|Amazon_Instant_Video|[jack, wagner, ro...|(10000,[62,91,264...|
|Amazon_Instant_Video|[moving, target, ...|(10000,[5,61,180,...|
+--------------------+--------------------+--------------------+
only showing top 2 rows

Vocabulary size: 10000
Some example words: ['one', 'book', 'great', 'like', 'good', 'well', 'time', 'get', 'really', 'also']


In [14]:
from pyspark.ml.clustering import LDA

# Set the number of topics (this is a key parameter!)
num_topics = 20

# Create the LDA model
lda = LDA(
    k=num_topics,           # Number of topics
    maxIter=20,             # Number of iterations
    featuresCol="features", # Column with our word vectors
    optimizer="em"          # Use expectation-maximization algorithm
)

# Train the model (this might take a while!)
print("Training LDA model...")
lda_model = lda.fit(vectorized_reviews)
print("Model training complete!")

Training LDA model...


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 44660)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/opt/conda/lib/python3.

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
from pyspark.sql.types import ArrayType, StringType

# Get topics with their top terms
topics = lda_model.describeTopics(maxTermsPerTopic=15)

# Convert term indices to actual words
def term_indices_to_words(indices):
    return [vocabulary[idx] for idx in indices]

# Create a UDF (User-Defined Function) for this
term_indices_to_words_udf = F.udf(term_indices_to_words, ArrayType(StringType()))

# Apply the UDF to get readable topics
topics_with_words = topics.withColumn(
    "terms", 
    term_indices_to_words_udf(F.col("termIndices"))
)

# Show each topic with its top terms
topics_with_words.select("topic", "terms", "termWeights").show(truncate=False)

In [None]:
from pyspark.sql.types import FloatType

# Apply the model to get topic distributions for each review
reviews_with_topics = lda_model.transform(vectorized_reviews)

# Extract the primary topic for each review
def get_primary_topic(distribution):
    return float(distribution.argmax())

# Create a UDF for this
get_primary_topic_udf = F.udf(get_primary_topic, FloatType())

# Get the primary topic for each review
reviews_with_topics = reviews_with_topics.withColumn(
    "primary_topic", 
    get_primary_topic_udf(F.col("topicDistribution"))
)

# Check distribution of topics
reviews_with_topics.groupBy("primary_topic").count().orderBy("primary_topic").show()

In [None]:
# Get topic distribution by category
topic_by_category = reviews_with_topics.groupBy("category", "primary_topic").count()

# Calculate percentages within each category
category_totals = reviews_with_topics.groupBy("category").count().withColumnRenamed("count", "total")

# Join the counts with totals
topic_percentage = topic_by_category.join(category_totals, on="category")
topic_percentage = topic_percentage.withColumn(
    "percentage", 
    F.round((F.col("count") / F.col("total") * 100), 2)
)

# Show results sorted by category and percentage
topic_percentage.orderBy("category", F.desc("percentage")).show(50)

In [None]:
# Convert to Pandas for easier manipulation
topics_df = topics_with_words.toPandas()

print("TOPIC SUMMARY:")
print("-" * 80)

for i, row in topics_df.iterrows():
    topic_id = row['topic']
    terms = row['terms']
    weights = row['termWeights']
    
    # Sort terms by weight
    term_weights = sorted(zip(terms, weights), key=lambda x: x[1], reverse=True)
    
    # Display top terms with weights
    print(f"Topic {topic_id}:")
    print(", ".join([f"{term} ({weight:.3f})" for term, weight in term_weights[:10]]))
    
    # Show what categories this topic appears in most
    top_categories = topic_percentage.filter(F.col("primary_topic") == topic_id) \
                                      .orderBy(F.desc("percentage")) \
                                      .limit(3)
    
    top_categories_pd = top_categories.toPandas()
    if not top_categories_pd.empty:
        category_info = ", ".join([f"{row['category']} ({row['percentage']}%)" 
                                  for _, row in top_categories_pd.iterrows()])
        print(f"Most common in: {category_info}")
    
    print("-" * 80)

In [None]:
# Save topics for reference
topics_with_words.write.mode("overwrite").parquet(
    "hdfs://namenode:9000/user/hadoop/amazon_reviews/results/lda_topics"
)

# Save review topic assignments
reviews_with_topics.select(
    "ProductID", "category", "Review/Score", "primary_topic", "topicDistribution"
).write.mode("overwrite").parquet(
    "hdfs://namenode:9000/user/hadoop/amazon_reviews/results/review_topics"
)

print("Results saved successfully!")