## Setting up Spark Session / Context

In [179]:
import math
import time
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression


In [180]:
def bytes_to_mb(size_bytes):
    return size_bytes / (1024 * 1024)

def bytes_to_gb(size_bytes):
    return size_bytes / (1024 * 1024 * 1024)

def configure_spark(dataset_size_gb):
    """Configures Spark based on dataset size."""
    if dataset_size_gb < 1:
        executor_cores = 2
        executor_memory = "4g"
    elif 1 <= dataset_size_gb <= 10:
        executor_cores = 4
        executor_memory = "8g"
    else:
        executor_cores = 8
        executor_memory = "16g"
    return executor_cores, executor_memory


def build_spark_session(hdfs_path, file_path, verbose=False):
    spark = SparkSession.builder.appName("Project Group 32 HDFSFileSize").getOrCreate()
    jvm = spark._jvm
    conf = jvm.org.apache.hadoop.conf.Configuration()
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jvm.java.net.URI.create(hdfs_path), conf)
    path = jvm.org.apache.hadoop.fs.Path(file_path)
    fileStatus = fs.getFileStatus(path)
    fileSize = fileStatus.getLen()

    if verbose:
        print(f"File size in bytes: {fileSize}")

    spark.stop()

    executor_cores, executor_memory = configure_spark(bytes_to_gb(fileSize))

    if verbose:
        print(f"A files size of {bytes_to_gb(fileSize):.4f} GB give spark executors with:\n"+
            f"Cores: {executor_cores}\n"+
            f"Mem/core: {int(executor_memory[:-1])/executor_cores:.0f}GB")


    spark_session = SparkSession.builder\
            .master("spark://192.168.2.156:7077") \
            .appName("Project Group 32 Andreas")\
            .config("spark.dynamicAllocation.enabled", True)\
            .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
            .config("spark.shuffle.service.enabled", False)\
            .config("spark.dynamicAllocation.executorIdleTimeout","60s")\
            .config("spark.executor.cores", executor_cores)\
            .config("spark.executor.memory", executor_memory)\
            .config("spark.driver.port",9999)\
            .config("spark.blockManager.port",10005)\
            .getOrCreate()

    # RDD API
    spark_context = spark_session.sparkContext
    spark_context.setLogLevel("ERROR")

    if verbose:
        print(f"Executor cores: {spark_session.conf.get('spark.executor.cores')}")
        print(f"Executor memory: {spark_session.conf.get('spark.executor.memory')}")

    return spark_session, spark_context, fileSize

## Create a dataframe to analyse the posts line by line

In [181]:
def load_data(spark_session, hdfs_path, file_path, fileSize, verbose=False):
    # Load JSON file into a Spark DataFrame
    df = spark_session.read.json(hdfs_path + file_path)

    if verbose:
        # Count the number of partitions in the underlying RDD.
        print(f"Number of default partitions after loading the data: {df.rdd.getNumPartitions()}")
        print("\n")

    # Repartition using "subreddit" as key.
    # The partition size matches the HDFS block size in MB.
    no_partitions = math.ceil(bytes_to_mb(fileSize) / 128)
    partition_key =  "subreddit"
    df.repartition(no_partitions, partition_key)
    if verbose:
        print(f"The data is now repartitoned on key: '{partition_key}', into {df.rdd.getNumPartitions()} partitions.")
        print("\n")

        # Show schema to understand the structure
        print("The schema:")
        df.printSchema()
        print("\n")

        # Show first few rows to inspect data
        print("The first five entries in the dataframe:")
        df.show(5, truncate=False)
        print("\n")

        # Count total number of rows
        print(f"Total Rows: {df.count()}")
    return df

## How many Subreddits do exist?

-- We see that many post are not assigned to a Subreddit, since we want to train a Classification model, we delete the NULL post --

In [182]:

def filter_and_split_data(df, seed=42, verbose=False):
    unique_subreddits = df.select("subreddit").distinct().count()
    if verbose:
        print(f"Unique Subreddits: {unique_subreddits}")
        df.groupBy("subreddit").count().orderBy(col("count").desc()).show(10, False)
    else:
        df.groupBy("subreddit").count().orderBy(col("count").desc())

    # Filter out rows where subreddit is NULL
    df_filtered = df.filter(col("subreddit").isNotNull())

    if verbose:
        # Show first few rows after filtering
        df_filtered.show(5, truncate=False)

        # Count remaining rows
        print(f"Total Posts After Filtering: {df_filtered.count()}")

    # Filter out NULL subreddit, summary, or content
    df_filtered = df.filter((col("subreddit").isNotNull()) & (col("summary").isNotNull()) & (col("content").isNotNull()))

    if verbose:
        # Show filtered data
        df_filtered.select("subreddit", "summary", "content").show(5, truncate=False)

    # Split data into training and test sets
    train_data, test_data = df_filtered.randomSplit([0.8, 0.2], seed=seed)

    return train_data, test_data, df_filtered.count()

## To prepare the Data for our ML Classification Model, we use the columns summary and content

## Create the training and test datasets

## We have to make the Text understandable for the algorithm

1. We first tokenize the the columns
2. Remove stop words, since they do not add information to the text
3. We convert the Text with TF-IDF to numbers

In [183]:
def pre_processing_pipe():
    # Tokenize summary and content
    tokenizer  = Tokenizer(inputCol="summary", outputCol="summary_tokens")
    tokenizer2 = Tokenizer(inputCol="content", outputCol="content_tokens")

    # Remove stopwords
    stopwords_remover  = StopWordsRemover(inputCol="summary_tokens", outputCol="summary_clean")
    stopwords_remover2 = StopWordsRemover(inputCol="content_tokens", outputCol="content_clean")

    # Convert words to numerical features using TF-IDF
    hashing_tf = HashingTF(inputCol="summary_clean", outputCol="summary_tf", numFeatures=1000)
    idf = IDF(inputCol="summary_tf", outputCol="summary_features")

    hashing_tf2 = HashingTF(inputCol="content_clean", outputCol="content_tf", numFeatures=1000)
    idf2 = IDF(inputCol="content_tf", outputCol="content_features")

    # Convert subreddit (text label) into a numerical label
    label_indexer = StringIndexer(inputCol="subreddit", outputCol="label", handleInvalid="keep")

    # Combine summary and content features
    feature_assembler = VectorAssembler(inputCols=["summary_features", "content_features"], outputCol="features")

    # Return pre-processing pipeline.
    return [tokenizer, tokenizer2, stopwords_remover, stopwords_remover2,
            hashing_tf, idf, hashing_tf2, idf2, label_indexer, feature_assembler]

In [184]:

def model_eval(model, test_data, description="", verbose=False):
    # Make predictions on test data
    predictions = model.transform(test_data)

    # Evaluate model accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    if verbose:
      print(f"Evaluation of {description}. \n"+
            f"Model Accuracy: {accuracy:.4f}")
    
    return accuracy


In [185]:


def random_forest(train_data, pre_pipe):
    # Define the Random Forest classifier
    classifier = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)

    # Create a new pipeline using Random Forest
    pipeline = Pipeline(stages= pre_pipe + [classifier])

    # Train the model
    model = pipeline.fit(train_data)

    # Save the trained model
    #model.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier_rf")
    
    return model

In [186]:

def logistic_regression(train_data, pre_pipe):
    # Define the classification model
    classifier = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

    # Create a new pipeline using Logistic Regression
    pipeline = Pipeline(stages= pre_pipe + [classifier])

    # Train the model
    model = pipeline.fit(train_data)

    # Save the trained model
    # model.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier")

    return model

In [187]:
def print_results(file_path, fileSize, no_samples, executor_cores, executor_memory, execution_time, accuracy_rf, accuracy_lr):
    """
    Performance and evaluation results
    """
    print("-" * 80)
    print("Spark Processing and Model Evaluation Results")
    print("-" * 80)

    print(f"File Path:        {file_path}")
    print(f"File Size:        {fileSize:.2f} GB")
    print(f"No samples:       {no_samples}")
    print(f"Executor Cores:   {executor_cores}")
    print(f"Executor Memory:  {executor_memory}")

    print("-" * 80)
    print("Performance Metrics:")
    print("-" * 80)

    print(f"Execution Time:   {execution_time:.2f} seconds")

    print("-" * 80)
    print("Model Accuracy:")
    print("-" * 80)

    print(f"Random Forest Accuracy:     {accuracy_rf:.4f}")
    print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")

    print("-" * 80)

In [188]:
hdfs_path = "hdfs://192.168.2.156:9000"


In [None]:
verbose=True
file_path = "/data/reddit/reddit_50k.json"
reddit_50_samples = np.zeros(5)
reddit_50_time    = np.zeros(5)
reddit_50_rf_acc  = np.zeros(5)
reddit_50_lr_acc  = np.zeros(5)

for i in range(1):
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    model_lr = logistic_regression(train_data, pre_pipe)
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier")
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier")

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    if verbose:
        print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                    executor_memory=executor_memory, execution_time=execution_time, 
                    accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_50_samples.mean())
print(reddit_50_samples)

print(reddit_50_time.mean())
print(reddit_50_time)

print(reddit_50_rf_acc.mean())
print(reddit_50_rf_acc)

print(reddit_50_lr_acc.mean())
print(reddit_50_lr_acc)


File size in bytes: 392090252
A files size of 0.3652 GB give spark executors with:
Cores: 2
Mem/core: 2GB
Executor cores: 2
Executor memory: 4g


                                                                                

Number of default partitions after loading the data: 4


The data is now repartitoned on key: 'subreddit', into 4 partitions.


The schema:
root
 |-- _corrupt_record: string (nullable = true)
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



The first five entries in the dataframe:
+---------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

Total Rows: 99999


                                                                                

Unique Subreddits: 3605


                                                                                

+-----------------+-----+
|subreddit        |count|
+-----------------+-----+
|NULL             |50182|
|AskReddit        |11909|
|leagueoflegends  |1256 |
|AdviceAnimals    |975  |
|funny            |880  |
|gaming           |762  |
|pics             |740  |
|politics         |738  |
|atheism          |703  |
|explainlikeimfive|631  |
+-----------------+-----+
only showing top 10 rows

+---------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

Total Posts After Filtering: 49817
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



In [None]:
verbose=False
file_path = "/data/reddit/reddit_100k.json"
reddit_100_samples = np.zeros(5)
reddit_100_time    = np.zeros(5)
reddit_100_rf_acc  = np.zeros(5)
reddit_100_lr_acc  = np.zeros(5)

for i in range(5):
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    model_lr = logistic_regression(train_data, pre_pipe)
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier", verbose=verbose)
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier", verbose=verbose)

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    if verbose:
        print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                    executor_memory=executor_memory, execution_time=execution_time, 
                    accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_100_samples.mean())
print(reddit_100_samples)

print(reddit_100_time.mean())
print(reddit_100_time)

print(reddit_100_rf_acc.mean())
print(reddit_100_rf_acc)

print(reddit_100_lr_acc.mean())
print(reddit_100_lr_acc)


Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=57>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", 

In [None]:
verbose=False
file_path = "/data/reddit/reddit_200k.json"
reddit_200_samples = np.zeros(5)
reddit_200_time    = np.zeros(5)
reddit_200_rf_acc  = np.zeros(5)
reddit_200_lr_acc  = np.zeros(5)

for i in range(5):
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    model_lr = logistic_regression(train_data, pre_pipe)
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier", verbose=verbose)
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier", verbose=verbose)

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    if verbose:
        print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                    executor_memory=executor_memory, execution_time=execution_time, 
                    accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_200_samples.mean())
print(reddit_200_samples)

print(reddit_200_time.mean())
print(reddit_200_time)

print(reddit_200_rf_acc.mean())
print(reddit_200_rf_acc)

print(reddit_200_lr_acc.mean())
print(reddit_200_lr_acc)


In [None]:
verbose=False
file_path = "/data/reddit/reddit_500k.json"
reddit_500_samples = np.zeros(5)
reddit_500_time    = np.zeros(5)
reddit_500_rf_acc  = np.zeros(5)
reddit_500_lr_acc  = np.zeros(5)

for i in range(5):
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    model_lr = logistic_regression(train_data, pre_pipe)
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier", verbose=verbose)
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier", verbose=verbose)

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    if verbose:
        print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                    executor_memory=executor_memory, execution_time=execution_time, 
                    accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_500_samples.mean())
print(reddit_500_samples)

print(reddit_500_time.mean())
print(reddit_500_time)

print(reddit_500_rf_acc.mean())
print(reddit_500_rf_acc)

print(reddit_500_lr_acc.mean())
print(reddit_500_lr_acc)


In [None]:
verbose=False
file_path = "/data/reddit/corpus-webis-tldr-17.json"
reddit_full_samples = np.zeros(5)
reddit_full_time    = np.zeros(5)
reddit_full_rf_acc  = np.zeros(5)
reddit_full_lr_acc  = np.zeros(5)

for i in range(5):
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    model_lr = logistic_regression(train_data, pre_pipe)
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier", verbose=verbose)
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier", verbose=verbose)

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    if verbose:
        print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                    executor_memory=executor_memory, execution_time=execution_time, 
                    accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_full_samples.mean())
print(reddit_full_samples)

print(reddit_full_time.mean())
print(reddit_full_time)

print(reddit_full_rf_acc.mean())
print(reddit_full_rf_acc)

print(reddit_full_lr_acc.mean())
print(reddit_full_lr_acc)


In [191]:
spark_context.stop()