## Setting up Spark Session / Context

In [1]:
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, BooleanType, DoubleType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/ubuntu/.local/lib/python3.10/site-pac

AttributeError: _ARRAY_API not found

In [2]:
# Spark cluster init
def bytes_to_mb(size_bytes):
    """Express a byte count in megabytes, using 1 MB = 1024 * 1024 bytes.

    Args:
        size_bytes (int | float): Size in bytes.

    Returns:
        float: The same size in megabytes.
    """
    bytes_per_mb = 1024 * 1024
    return size_bytes / bytes_per_mb

def bytes_to_gb(size_bytes):
    """Express a byte count in gigabytes, using 1 GB = 1024 ** 3 bytes.

    Args:
        size_bytes (int | float): Size in bytes.

    Returns:
        float: The same size in gigabytes.
    """
    bytes_per_gb = 1024 * 1024 * 1024
    return size_bytes / bytes_per_gb

def configure_spark(dataset_size_gb):
    """Derive a core count and an executor memory string from the dataset size.

    The core count is two cores per (truncated) GB of data plus two, rounded
    up to the next even number. The memory string is one GB per core, with
    the core count capped at 4 for that calculation only.

    Args:
        dataset_size_gb (float): Size of the dataset in gigabytes.

    Returns:
        tuple: ``(executor_cores, executor_memory)`` where ``executor_cores``
        is an even int (not capped here) and ``executor_memory`` is a string
        such as ``"4g"``.
    """
    cores_per_gb = 2
    memory_gb_per_core = 1

    executor_cores = 2 + int(dataset_size_gb * cores_per_gb)
    # Adding the remainder bumps odd counts up by one and leaves even ones alone.
    executor_cores += executor_cores % 2

    memory_gb = min(executor_cores, 4) * memory_gb_per_core
    return executor_cores, f"{memory_gb}g"


def build_spark_session(hdfs_path, file_path, verbose=False, master_url="spark://192.168.2.156:7077"):
    """Builds a Spark session sized after the input file's size in HDFS.

    A short-lived probe session is created first, only to reach the Hadoop
    FileSystem API through the JVM gateway and read the file length. The probe
    is always stopped (even when the lookup fails) before the real session is
    configured from ``configure_spark``.

    Args:
        hdfs_path (str): HDFS URI, e.g. ``"hdfs://host:9000"``.
        file_path (str): File path within HDFS.
        verbose (bool, optional): Enable verbose output. Defaults to False.
        master_url (str, optional): Spark master URL for the real session.
            Defaults to the cluster address previously hard-coded here.

    Returns:
        tuple: SparkSession, SparkContext, and file size in bytes.
    """
    probe_session = SparkSession.builder.appName("Project Group 32 HDFSFileSize").getOrCreate()
    try:
        jvm = probe_session._jvm
        conf = jvm.org.apache.hadoop.conf.Configuration()
        fs = jvm.org.apache.hadoop.fs.FileSystem.get(jvm.java.net.URI.create(hdfs_path), conf)
        path = jvm.org.apache.hadoop.fs.Path(file_path)
        fileSize = fs.getFileStatus(path).getLen()

        if verbose:
            print(f"File size in bytes: {fileSize}")
    finally:
        # Stop the probe even if the file lookup raised; otherwise the next
        # getOrCreate() would silently reuse this session.
        probe_session.stop()

    executor_cores, executor_memory = configure_spark(bytes_to_gb(fileSize))

    spark_session = SparkSession.builder\
            .master(master_url) \
            .appName("Project Group 32")\
            .config("spark.dynamicAllocation.enabled", True)\
            .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
            .config("spark.shuffle.service.enabled", False)\
            .config("spark.dynamicAllocation.executorIdleTimeout","60s")\
            .config("spark.executor.cores", min(executor_cores, 4))\
            .config("spark.executor.memory", executor_memory)\
            .config("spark.cores.max", min(executor_cores, 32))\
            .config("spark.driver.port",9999)\
            .config("spark.blockManager.port",10005)\
            .getOrCreate()

    # RDD API
    spark_context = spark_session.sparkContext
    spark_context.setLogLevel("ERROR")

    if verbose:
        max_cores = spark_session.conf.get('spark.cores.max')
        cores_per_executor = spark_session.conf.get('spark.executor.cores')
        memory_per_executor = spark_session.conf.get('spark.executor.memory')
        # The memory setting looks like "4g": drop the unit suffix to do arithmetic.
        mem_per_core = int(memory_per_executor[:-1]) / int(cores_per_executor)
        print(f"A files size of {bytes_to_gb(fileSize):.4f} GB give a maximum \n"+
              f"of {max_cores} cores divided on spark executors with:\n"+
              f"Executor cores: {cores_per_executor}\n"+
              f"Executor memory: {memory_per_executor}\n"+
              f"Mem/core: {mem_per_core:.0f}GB")

    return spark_session, spark_context, fileSize

## Create a dataframe to analyse the posts line by line

In [3]:
def load_data(spark_session, hdfs_path, file_path, fileSize, schema=None, verbose=False):
    """Loads JSON data from HDFS into a Spark DataFrame and repartitions it.

    Args:
        spark_session (SparkSession): Spark session.
        hdfs_path (str): HDFS path.
        file_path (str): File path within HDFS.
        fileSize (int): Size of the file in bytes.
        schema: Schema for the JSON object. Defaults to None which infers schema from the data.
        verbose (bool, optional): Enable verbose output. Defaults to False.

    Returns:
        DataFrame: The loaded DataFrame, repartitioned on the "subreddit" column.
    """
    source = hdfs_path + file_path

    # Load JSON file into a Spark DataFrame
    if schema is None:
        df = spark_session.read.json(source)
        # Show schema to understand the structure
        print("The schema:")
        df.printSchema()
        print("\n")
    else:
        df = spark_session.read.json(source, schema=schema)

    if verbose:
        # Count the number of partitions in the underlying RDD.
        print(f"Number of default partitions after loading the data: {df.rdd.getNumPartitions()}")
        print("\n")

    # Repartition using "subreddit" as key, aiming for roughly 128 MB of input
    # per partition (128 MB is assumed to be the HDFS block size -- TODO confirm
    # against the cluster configuration). At least one partition is always
    # requested so an empty file does not ask Spark for zero partitions.
    no_partitions = max(1, math.ceil(bytes_to_mb(fileSize) / 128))
    partition_key =  "subreddit"
    # DataFrames are immutable: repartition() returns a new DataFrame, so the
    # result has to be rebound or the repartitioning is silently lost.
    df = df.repartition(no_partitions, partition_key)
    if verbose:
        print(f"The data is now repartitoned on key: '{partition_key}', into {df.rdd.getNumPartitions()} partitions.")
        print("\n")

    return df

## How many subreddits exist?

-- We see that many posts are not assigned to a subreddit. Since we want to train a classification model, we delete the posts with NULL values. --

In [4]:
def filter_data(df):
    """Keeps complete rows that belong to the 25 most frequent subreddits.

    Rows where "subreddit", "summary" or "content" is NULL are dropped first.
    The subreddit ranking is computed on the remaining rows and collected to
    the driver, which triggers a Spark job.

    Args:
        df (DataFrame): Input data with "subreddit", "summary" and "content" columns.

    Returns:
        DataFrame: The NULL-free rows whose subreddit is among the top 25 by row count.
    """
    has_all_fields = (
        col("subreddit").isNotNull()
        & col("summary").isNotNull()
        & col("content").isNotNull()
    )
    complete_rows = df.filter(has_all_fields)

    # Rank subreddits by number of rows, most frequent first.
    ranked_subreddits = (
        complete_rows
        .groupBy("subreddit")
        .count()
        .orderBy(col("count").desc())
    )

    # Only 25 small rows are brought to the driver here.
    top_rows = ranked_subreddits.limit(25).collect()
    top_25_subreddits = [row["subreddit"] for row in top_rows]

    return complete_rows.filter(col("subreddit").isin(top_25_subreddits))



In [5]:
def split_data(df, seed=42, test_fraction=0.2):
    """Randomly splits a DataFrame into a training part and a test part.

    Args:
        df (DataFrame): Data to split.
        seed (int, optional): Seed passed to ``randomSplit``. Defaults to 42.
        test_fraction (float, optional): Weight of the test part. Defaults to 0.2.

    Returns:
        tuple: ``(train_data, test_data)``.
    """
    weights = [(1-test_fraction), test_fraction]
    train_part, test_part = df.randomSplit(weights, seed=seed)
    return train_part, test_part

## To prepare the Data for our ML Classification Model, we use the columns summary and content

## Create the training and test datasets

## We have to make the Text understandable for the algorithm

1. We first tokenize the columns
2. Remove stop words, since they do not add information to the text
3. We convert the Text with TF-IDF to numbers

In [6]:
def pre_processing_pipe():
    """
    Builds the text pre-processing stages shared by the classifiers.

    Both text columns ("summary" and "content") go through the same chain:
    tokenization, stop word removal, term-frequency hashing into 1000 buckets
    and IDF weighting. The "subreddit" column is indexed into a numeric
    "label", and the two feature vectors are assembled into "features".

    Returns:
        list: Spark ML pipeline stages, in execution order.
    """
    text_columns = ("summary", "content")

    # Tokenize summary and content
    tokenizers = [
        Tokenizer(inputCol=name, outputCol=f"{name}_tokens")
        for name in text_columns
    ]

    # Remove stopwords
    stopword_removers = [
        StopWordsRemover(inputCol=f"{name}_tokens", outputCol=f"{name}_clean")
        for name in text_columns
    ]

    # Convert words to numerical features using TF-IDF (hashing stage followed
    # by its IDF stage, for each column in turn)
    tf_idf_stages = []
    for name in text_columns:
        tf_idf_stages.append(
            HashingTF(inputCol=f"{name}_clean", outputCol=f"{name}_tf", numFeatures=1000))
        tf_idf_stages.append(
            IDF(inputCol=f"{name}_tf", outputCol=f"{name}_features"))

    # Convert subreddit (text label) into a numerical label
    label_indexer = StringIndexer(inputCol="subreddit", outputCol="label", handleInvalid="keep")

    # Combine summary and content features
    feature_assembler = VectorAssembler(
        inputCols=[f"{name}_features" for name in text_columns],
        outputCol="features")

    return tokenizers + stopword_removers + tf_idf_stages + [label_indexer, feature_assembler]

In [7]:

def model_eval(model, test_data, description="", verbose=False):
    """
    Scores a fitted model on held-out data and returns its accuracy.

    Args:
        model: The trained Spark ML model (anything exposing ``transform``).
        test_data (DataFrame): The test dataset.
        description (str, optional): Model name used in the verbose message. Defaults to "".
        verbose (bool, optional): Print the accuracy when True. Defaults to False.

    Returns:
        float: Accuracy of the predictions against the "label" column.
    """
    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy",
    )

    # transform() adds the "prediction" column that the evaluator compares to "label".
    scored = model.transform(test_data)
    accuracy = accuracy_evaluator.evaluate(scored)

    if verbose:
        message = f"Evaluation of {description}. \n" + f"Model Accuracy: {accuracy:.4f}"
        print(message)

    return accuracy


In [8]:


def random_forest(train_data, pre_pipe):
    """
    Fits the pre-processing stages followed by a 100-tree Random Forest.

    Args:
        train_data (DataFrame): The training dataset.
        pre_pipe (list): Pre-processing stages to run before the classifier.

    Returns:
        PipelineModel: The fitted pipeline (pre-processing + Random Forest).
    """
    forest = RandomForestClassifier(
        featuresCol="features",
        labelCol="label",
        numTrees=100,
    )

    # The classifier is appended last so it consumes the assembled "features".
    stages = pre_pipe + [forest]
    fitted = Pipeline(stages=stages).fit(train_data)

    # Persisting the model is currently disabled:
    #fitted.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier_rf")

    return fitted

In [9]:

def logistic_regression(train_data, pre_pipe):
    """
    Fits the pre-processing stages followed by a Logistic Regression (10 iterations max).

    Args:
        train_data (DataFrame): The training dataset.
        pre_pipe (list): Pre-processing stages to run before the classifier.

    Returns:
        PipelineModel: The fitted pipeline (pre-processing + Logistic Regression).
    """
    regressor = LogisticRegression(
        featuresCol="features",
        labelCol="label",
        maxIter=10,
    )

    # The classifier is appended last so it consumes the assembled "features".
    stages = pre_pipe + [regressor]
    fitted = Pipeline(stages=stages).fit(train_data)

    # Persisting the model is currently disabled:
    # fitted.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier")

    return fitted

In [10]:
def print_results(i, fileSize, executor_cores, executor_memory, max_cores, data_load_time, training_time, evaluation_time, overall_exec_time, model_accuracy):
    """
    Prints a formatted report for one benchmark iteration.

    Args:
        i (int): Iteration index shown in the header.
        fileSize (float): Size of the input file; printed with a "GB" unit.
        executor_cores: Cores per executor (printed as-is).
        executor_memory: Executor memory setting, e.g. "4g" (printed as-is).
        max_cores: Maximum number of cores for the application (printed as-is).
        data_load_time (float): Seconds spent loading/preparing the data.
        training_time (float): Seconds spent training the model.
        evaluation_time (float): Seconds spent evaluating the model.
        overall_exec_time (float): Total seconds for the iteration.
        model_accuracy (float): Accuracy of the evaluated model.
    """
    rule = "-" * 80

    def section(title):
        # Section title framed by horizontal rules.
        print(rule)
        print(title)
        print(rule)

    print(rule)
    print("Spark Processing and Model Evaluation Results\n")
    print(f"Iteration {i}")
    print(rule)

    print(f"File Size:        {fileSize:.2f} GB")
    print(f"Max cores:        {max_cores}")
    print(f"Executor Cores:   {executor_cores}")
    print(f"Executor Memory:  {executor_memory}")

    section("Performance Metrics:")

    print(f"Data load Time:         {data_load_time:.2f} seconds")
    print(f"Training Time:          {training_time:.2f} seconds")
    print(f"Evaluation Time:        {evaluation_time:.2f} seconds")
    print(f"Overall execution Time: {overall_exec_time:.2f} seconds")

    section("Model Accuracy:")

    print(f"Random Forest Accuracy:     {model_accuracy:.4f}")

    print(rule)

In [11]:
def evaluate_performance(hdfs_path, file_path, schema, n=5, verbose=False):
    """Runs the full load/train/evaluate cycle ``n`` times and averages the timings.

    Every iteration builds a fresh Spark session, loads and filters the data,
    trains a Random Forest pipeline, evaluates it on the test split and stops
    the Spark context again. The context is stopped even when an iteration
    fails, so a crash does not leave a live session behind for the next call.

    NOTE(review): the "data load" timer stops before any model is fitted; since
    Spark evaluates lazily, it presumably covers session start-up plus the
    ``collect()`` inside ``filter_data`` rather than a full read -- confirm
    before interpreting the numbers.

    Args:
        hdfs_path (str): HDFS URI.
        file_path (str): File path within HDFS.
        schema: Schema handed to ``load_data`` (None infers it).
        n (int, optional): Number of repetitions; must be at least 1. Defaults to 5.
        verbose (bool, optional): Print per-iteration details. Defaults to False.

    Returns:
        list: ``[file size in GB, executor cores, executor memory, max cores,
        mean data load time, mean training time, mean evaluation time,
        mean overall time, mean accuracy]``. The Spark settings are those of
        the last iteration.

    Raises:
        ValueError: If ``n`` is smaller than 1 (there would be nothing to average).
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    overall_exec_time   = np.zeros(n)
    data_load_time      = np.zeros(n)
    training_time       = np.zeros(n)
    evaluation_time     = np.zeros(n)
    model_accuracy      = np.zeros(n)

    for i in range(n):

        print(f"File {file_path}, run {i}")
        start_time = time.time()

        # Create a spark session
        spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)

        try:
            # Load the data
            df = load_data(spark_session, hdfs_path, file_path, fileSize, schema=schema, verbose=verbose)

            # Filter the data
            df = filter_data(df)

            # Split data into training and test sets
            train_data, test_data = split_data(df)

            # Save time for data load/transform
            data_time = time.time()
            data_load_time[i] = data_time - start_time

            # Create a pipeline for the pre-processing
            pre_pipe = pre_processing_pipe()
            # Create and train a ML model for classification
            model_rf = random_forest(train_data, pre_pipe)

            # Save time for model training
            train_time = time.time()
            training_time[i] = train_time - data_time

            # Evaluate the performance of the ML model on the test data
            model_accuracy[i] = model_eval(model_rf, test_data, description="Random forest classifier", verbose=verbose)

            # Save time for model evaluation
            eval_time = time.time()
            evaluation_time[i] = eval_time - train_time

            # Read the effective settings while the session is still alive.
            executor_cores = spark_session.conf.get("spark.executor.cores")
            executor_memory = spark_session.conf.get("spark.executor.memory")
            max_cores = spark_session.conf.get('spark.cores.max')
        finally:
            # Always release the cluster resources, also when the run failed.
            spark_context.stop()

        # Determine overall execution time
        end_time = time.time()
        overall_exec_time[i] = end_time - start_time

        if verbose:
            print_results(i, bytes_to_gb(fileSize), executor_cores, executor_memory, max_cores, data_load_time[i], training_time[i], evaluation_time[i], overall_exec_time[i], model_accuracy[i])

    return [bytes_to_gb(fileSize), executor_cores, executor_memory, max_cores, data_load_time.mean(), training_time.mean(), evaluation_time.mean(), overall_exec_time.mean(), model_accuracy.mean()]

In [12]:
# Schemas for the reddit data. Every field is a nullable string; passing an
# explicit schema to the JSON reader avoids schema inference.
subreddit_field = StructField("subreddit", StringType(), True)
summary_field   = StructField("summary",   StringType(), True)
content_field   = StructField("content",   StringType(), True)
body_field      = StructField("body",      StringType(), True)

# v0: label column only; v1: label plus the two text columns used as features.
schema_v0 = StructType([subreddit_field])
schema_v1 = StructType([subreddit_field, summary_field, content_field])

Evaluate all reddit datasets five times and save the average as a result. 

In [None]:
# Benchmark every dataset and collect one averaged result row per file.
hdfs_path = "hdfs://192.168.2.156:9000"
file_path = "/data/reddit/"

files = [
    "reddit_50k.json",
    "reddit_100k.json",
    "reddit_200k.json",
    "reddit_500k.json",
    "corpus-webis-tldr-17.json",
]

result_columns = [
    'File', 'File size', 'Executor cores', 'Executor memory', 'Max cores',
    'Data load time', 'Training time', 'Evaluation time', 'Overall exec time',
    'Model accuracy',
]

results = []

for file in files:
    try:
        res = evaluate_performance(hdfs_path, f"{file_path}{file}", schema=schema_v1, n=5)
        # Drop the ".json" suffix to get the dataset name.
        dataset_name = file[:-5]
        results.append([dataset_name] + res)

    except Exception as e:
        # Keep going with the remaining files; a failed file simply has no row.
        print(f"Crashed when evaluating {file} with error:")
        print(str(e))


result_df = pd.DataFrame(results, columns=result_columns)
print(result_df)

File /data/reddit/reddit_50k.json, run 0


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/11 22:03:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

File /data/reddit/reddit_50k.json, run 1


                                                                                

In [None]:
# Persist the benchmark table as CSV in the output directory.
out_path = "/home/ubuntu/out/"
csv_target = f"{out_path}performance_data.csv"
result_df.to_csv(csv_target)

In [None]:
!pip install openpyxl 
with pd.ExcelWriter(f"{out_path}performance_data.xlsx", engine="openpyxl") as writer:
    result_df.to_excel(writer) 