In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, ChiSqSelector, Normalizer, StringIndexer
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import numpy as np
import pandas as pd
import random

# Seed Python's and NumPy's global RNGs with the same value
seed = 42
random.seed(seed)
np.random.seed(seed)

# Session-level Spark settings, applied to the builder in this order.
# NOTE(review): "spark.logLevel" does not look like a documented Spark
# property; the setLogLevel() call below is what sets the log level — confirm.
spark_settings = {
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max": "1g",
    "spark.sql.shuffle.partitions": "200",
    "spark.default.parallelism": "200",
    "spark.rdd.compress": "true",
    "spark.logLevel": "ERROR",
}

# Create (or reuse) the session with the settings above
session_builder = SparkSession.builder.appName("DIC EX 2_2 - group 36")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)
spark = session_builder.getOrCreate()

# Only surface errors in the driver log output
spark.sparkContext.setLogLevel("ERROR")

print("Spark session initialized with optimized configuration")



Spark session initialized with optimized configuration


In [None]:
# Read the development review set (JSON) from HDFS
print("Loading review data...")
data_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"

# Mark the parsed reviews for caching; the following steps scan them repeatedly
df = spark.read.json(data_path).cache()

# Print the inferred schema, then a five-row preview of text and category
print("Dataset Schema:")
df.printSchema()
print("\nSample data:")
preview_columns = ["reviewText", "category"]
df.select(*preview_columns).show(5, truncate=True)

# Function to check dataset statistics
def print_stats(df):
    """Print the number of rows in ``df`` and its per-category row counts.

    Args:
        df: DataFrame with a ``category`` column.

    Returns:
        The per-category counts, ordered by ``count`` descending. The first
        20 rows of this result are also printed.
    """
    review_total = df.count()
    per_category = df.groupBy("category").count()
    ranked_counts = per_category.orderBy("count", ascending=False)

    print(f"Total number of reviews: {review_total}")
    print("Category distribution:")
    ranked_counts.show(20, truncate=False)

    return ranked_counts

# Print dataset statistics and keep the ordered per-category counts
category_counts = print_stats(df)

# Pull only the first ten rows of the ordered counts to the driver as pandas
top_category_counts = category_counts.limit(10)
category_dist = top_category_counts.toPandas()

# Hold out 30% of the reviews, then halve the hold-out into validation and test
print("Splitting data into training, validation, and test sets...")
train_data, temp_data = df.randomSplit([0.7, 0.3], seed=seed)
validation_data, test_data = temp_data.randomSplit([0.5, 0.5], seed=seed)

# Mark all three splits for caching
train_data, validation_data, test_data = (
    split.cache() for split in (train_data, validation_data, test_data)
)

# Counting each split also reports its size
for split_label, split_df in (
    ("Training", train_data),
    ("Validation", validation_data),
    ("Test", test_data),
):
    print(f"{split_label} set size: {split_df.count()}")



Loading review data...
Dataset Schema:
root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)


Sample data:
+--------------------+--------------------+
|          reviewText|            category|
+--------------------+--------------------+
|This was a gift f...|Patio_Lawn_and_Garde|
|This is a very ni...|Patio_Lawn_and_Garde|
|The metal base wi...|Patio_Lawn_and_Garde|
|For the most part...|Patio_Lawn_and_Garde|
|This hose is supp...|Patio_Lawn_and_Garde|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

Total number of reviews: 78829
Category distribution:


                                                                                

+--------------------------+-----+
|category                  |count|
+--------------------------+-----+
|Book                      |22507|
|Electronic                |7825 |
|Clothing_Shoes_and_Jewelry|5749 |
|Movies_and_TV             |4607 |
|Home_and_Kitche           |4254 |
|CDs_and_Vinyl             |3749 |
|Cell_Phones_and_Accessorie|3447 |
|Sports_and_Outdoor        |3269 |
|Kindle_Store              |3205 |
|Health_and_Personal_Care  |2982 |
|Apps_for_Android          |2638 |
|Toys_and_Game             |2253 |
|Beauty                    |2023 |
|Tools_and_Home_Improvement|1926 |
|Automotive                |1374 |
|Grocery_and_Gourmet_Food  |1297 |
|Office_Product            |1243 |
|Pet_Supplie               |1235 |
|Patio_Lawn_and_Garde      |994  |
|Baby                      |916  |
+--------------------------+-----+
only showing top 20 rows



                                                                                

Splitting data into training, validation, and test sets...
Training set size: 55332
Validation set size: 11805
Test set size: 11692
Loading selected features from output_ds.txt...
Loaded 75 features from output_ds.txt
Sample features: ['amazon', 'author', 'back', 'bad', 'big', 'bit', 'bought', 'buy', 'character', 'characters']
Building the ML Pipeline for classification...
Setting up parameter grid for SVM optimization...

Training model with 2000 features...
This may take some time...


[Stage 11924:(11 + 1) / 14][Stage 11926:(5 + 0) / 14][Stage 11928:(5 + 0) / 14] ]]

KeyboardInterrupt: 

In [None]:
# Read the whitespace-separated feature terms written in Part 2
print("Loading selected features from output_ds.txt...")
with open("output_ds.txt", "r") as features_file:
    raw_features = features_file.read()
selected_features = raw_features.strip().split()

# Wrap the term list in a Spark broadcast variable
broadcast_features = spark.sparkContext.broadcast(selected_features)

feature_total = len(selected_features)
print(f"Loaded {feature_total} features from output_ds.txt")
print(f"Sample features: {selected_features[:10]}")

# Efficiently load stopwords once and broadcast them
def load_stopwords(path: str) -> list[str]:
    """Read a stopword file and return its distinct entries in file order.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and repeated words are kept only once.

    Args:
        path: Path of a UTF-8 text file with one stopword per line.

    Returns:
        The distinct stopwords, ordered by first appearance in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # dict.fromkeys de-duplicates while preserving first-seen order, so the
        # result is the same on every run (a set's iteration order for strings
        # varies between interpreter runs under hash randomization).
        return list(dict.fromkeys(word for word in stripped_lines if word))

# Read the stopword file once on the driver, then wrap it in a broadcast variable
stopwords_list = load_stopwords("stopwords.txt")
broadcast_stopwords = spark.sparkContext.broadcast(stopwords_list)

print("Building the ML Pipeline for classification...")

# Stage 1: map the category string to a numeric "label" column
label_indexer = StringIndexer(inputCol="category", outputCol="label")

# Stage 2: split the review text on runs of whitespace, digits and punctuation
token_delimiters = "[\\s\\t\\d\\(\\)\\[\\]\\{\\}\\.\\!\\?\\,\\;\\:\\+\\=\\-\\_\\\"\\'`\\~\\#\\@\\&\\*\\%\\€\\$\\§\\\\\\/]+"
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="tokens",
    pattern=token_delimiters,
)

# Stage 3: drop stopwords; the word list is read back from the broadcast value
stopwords_remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="tokens_filtered",
    stopWords=broadcast_stopwords.value,
)

# Stage 4: term-frequency vectors over a vocabulary capped at 20000 terms
count_vectorizer = CountVectorizer(
    inputCol="tokens_filtered",
    outputCol="tf",
    vocabSize=20000,
)

# Stage 5: reweight term frequencies by inverse document frequency
idf = IDF(inputCol="tf", outputCol="tf_idf")

# Stage 6: chi-square feature selection, in two sizes that share all other settings
chi_sq_columns = dict(
    featuresCol="tf_idf",
    outputCol="selected_features",
    labelCol="label",
)
chi_sq_selector_2000 = ChiSqSelector(numTopFeatures=2000, **chi_sq_columns)
chi_sq_selector_500 = ChiSqSelector(numTopFeatures=500, **chi_sq_columns)

# Stage 7: L2-normalize the selected feature vectors
normalizer = Normalizer(
    inputCol="selected_features",
    outputCol="normalized_features",
    p=2.0,
)



In [None]:
# Stage 8: binary linear SVM, wrapped in one-vs-rest to handle multiple classes
svm = LinearSVC(featuresCol="normalized_features", labelCol="label")
ovr = OneVsRest(
    classifier=svm,
    featuresCol="normalized_features",
    labelCol="label",
    predictionCol="prediction",
)

# Both pipelines share every stage except the chi-square selector
shared_front_stages = [
    label_indexer,
    tokenizer,
    stopwords_remover,
    count_vectorizer,
    idf,
]
pipeline_2000 = Pipeline(stages=shared_front_stages + [chi_sq_selector_2000, normalizer, ovr])
pipeline_500 = Pipeline(stages=shared_front_stages + [chi_sq_selector_500, normalizer, ovr])

# Hyper-parameter grid over the base SVM: 3 x 2 x 2 = 12 combinations
print("Setting up parameter grid for SVM optimization...")
grid_builder = ParamGridBuilder()
grid_builder.addGrid(svm.regParam, [0.01, 0.1, 1.0])
grid_builder.addGrid(svm.standardization, [True, False])
grid_builder.addGrid(svm.maxIter, [10, 50])
param_grid = grid_builder.build()

# F1 on the indexed label is the model-selection metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

# One 3-fold cross-validator per pipeline; settings other than the estimator are shared
cv_settings = dict(
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4,
    seed=seed,
)
cv_2000 = CrossValidator(estimator=pipeline_2000, **cv_settings)
cv_500 = CrossValidator(estimator=pipeline_500, **cv_settings)

# Function to extract parameter settings from a model - optimized version
def extract_params(pipeline_model):
    """Return the tuned SVM hyper-parameter values of a fitted pipeline.

    Scans the pipeline's stages for the first one that carries a non-empty
    list of per-class binary models and reads ``regParam``,
    ``standardization`` and ``maxIter`` from the first of them.

    A fitted ``OneVsRestModel`` exposes its binary classifiers as ``models``.
    ``subModels`` is also accepted so objects using that attribute name keep
    working.

    Args:
        pipeline_model: Fitted pipeline exposing a ``stages`` sequence.

    Returns:
        A dict with keys ``regParam``, ``standardization`` and ``maxIter``
        holding the parameter values, or ``None`` when no stage carries
        binary models.
    """
    tuned_names = ("regParam", "standardization", "maxIter")

    def _value_of(model, name):
        # Attribute access on a Spark ML model yields the Param descriptor,
        # not its value; getOrDefault resolves the actual setting.
        if hasattr(model, "getOrDefault"):
            return model.getOrDefault(name)
        return getattr(model, name)

    for stage in pipeline_model.stages:
        binary_models = getattr(stage, "models", None) or getattr(stage, "subModels", None)
        if not binary_models:
            continue
        # Only the first binary model is inspected, as before.
        base_model = binary_models[0]
        return {name: _value_of(base_model, name) for name in tuned_names}
    return None

# Accumulates one summary dict per trained feature-set configuration
results = list()



In [None]:
# Cross-validate the 2000-feature pipeline on the training split
print("\nTraining model with 2000 features...")
print("This may take some time...")
cv_model_2000 = cv_2000.fit(train_data)

# Keep the best pipeline found by the search and read back its SVM settings
best_model_2000 = cv_model_2000.bestModel
params_2000 = extract_params(best_model_2000)

# Score the held-out test split; the predictions are cached for reuse
predictions_2000 = best_model_2000.transform(test_data)
predictions_2000 = predictions_2000.cache()
f1_score_2000 = evaluator.evaluate(predictions_2000)

# Record this run in the shared results list
results.append(dict(
    feature_set="2000 features",
    regParam=params_2000["regParam"],
    standardization=params_2000["standardization"],
    maxIter=params_2000["maxIter"],
    f1_score=f1_score_2000,
))

print("\nBest parameters for 2000 features:")
for param_name in ("regParam", "standardization", "maxIter"):
    print(f"  {param_name}: {params_2000[param_name]}")
print(f"  F1 score on test set: {f1_score_2000:.4f}")



In [None]:
# Cross-validate the 500-feature pipeline on the training split
print("\nTraining model with 500 features...")
print("This may take some time...")
cv_model_500 = cv_500.fit(train_data)

# Keep the best pipeline found by the search and read back its SVM settings
best_model_500 = cv_model_500.bestModel
params_500 = extract_params(best_model_500)

# Score the held-out test split; the predictions are cached for reuse
predictions_500 = best_model_500.transform(test_data)
predictions_500 = predictions_500.cache()
f1_score_500 = evaluator.evaluate(predictions_500)

# Record this run in the shared results list
results.append(dict(
    feature_set="500 features",
    regParam=params_500["regParam"],
    standardization=params_500["standardization"],
    maxIter=params_500["maxIter"],
    f1_score=f1_score_500,
))

print("\nBest parameters for 500 features:")
for param_name in ("regParam", "standardization", "maxIter"):
    print(f"  {param_name}: {params_500[param_name]}")
print(f"  F1 score on test set: {f1_score_500:.4f}")



In [None]:
# Pick the better of the two tuned models. Ties go to the 2000-feature run so
# this choice agrees with max(results, ...) below, which keeps the first of
# equal entries (the 2000-feature result is appended first).
use_2000 = f1_score_2000 >= f1_score_500
best_predictions = predictions_2000 if use_2000 else predictions_500
best_feature_set = "2000 features" if use_2000 else "500 features"
best_pipeline_model = best_model_2000 if use_2000 else best_model_500

# Reuse the label indexer fitted inside the winning pipeline (it is the first
# stage of both pipelines). Refitting the indexer on the full dataset could
# assign different indices than the ones the predictions were produced with,
# and costs an extra pass over the data.
label_mapping_model = best_pipeline_model.stages[0]
label_mapping = dict(enumerate(label_mapping_model.labels))

# Bring only the columns needed for the confusion matrix to the driver.
# NOTE(review): limit(10000) is below the reported test-set size (11692), so
# the matrix covers a subset of the predictions — confirm this is intended.
pred_df = best_predictions.select("category", "prediction", "label").limit(10000).toPandas()
true_labels = pred_df["label"]
pred_labels = pred_df["prediction"]

# Label indices of the ten most frequent categories
top_categories = category_counts.limit(10).toPandas()['category'].tolist()
category_to_idx = {cat: idx for idx, cat in enumerate(label_mapping_model.labels)}
top_indices = [category_to_idx[cat] for cat in top_categories]

# Confusion matrix restricted to the top categories:
# rows are true labels, columns are predicted labels
conf_matrix = np.zeros((len(top_indices), len(top_indices)), dtype=int)
for i, true_cat in enumerate(top_indices):
    true_mask = true_labels == true_cat  # hoisted: does not depend on j
    for j, pred_cat in enumerate(top_indices):
        # Vectorized Series.sum() instead of the builtin sum over a Series
        conf_matrix[i, j] = (true_mask & (pred_labels == pred_cat)).sum()

# Determine the overall best model configuration
best_config = max(results, key=lambda x: x['f1_score'])
print("\nBest Overall Configuration:")
print(f"Feature Set: {best_config['feature_set']}")
print(f"regParam: {best_config['regParam']}")
print(f"standardization: {best_config['standardization']}")
print(f"maxIter: {best_config['maxIter']}")
print(f"F1 Score: {best_config['f1_score']:.4f}")

# Save the best model parameters
with open("best_model_params.txt", "w") as f:
    f.write(f"Feature Set: {best_config['feature_set']}\n")
    f.write(f"regParam: {best_config['regParam']}\n")
    f.write(f"standardization: {best_config['standardization']}\n")
    f.write(f"maxIter: {best_config['maxIter']}\n")
    f.write(f"F1 Score: {best_config['f1_score']:.4f}\n")

print("\nBest model parameters saved to 'best_model_params.txt'")

# Conclusion
print("\nConclusion:")
print("We have successfully implemented an optimized text classification pipeline using Spark ML.")
print("The pipeline includes text preprocessing, feature extraction with TF-IDF, and SVM classification.")
print("We compared two feature dimensions (2000 vs 500 features) and varied SVM parameters.")
print(f"The best model achieved an F1 score of {best_config['f1_score']:.4f} on the test set.")

# Release every cached DataFrame before shutting the session down
train_data.unpersist()
validation_data.unpersist()
test_data.unpersist()
predictions_2000.unpersist()
predictions_500.unpersist()
df.unpersist()

# Stop Spark session
spark.stop()
print("Spark session stopped and resources released")