## Setting up Spark Session / Context

In [None]:
import math
import time
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression


In [None]:
def bytes_to_mb(size_bytes):
    """Converts bytes to megabytes."""
    return size_bytes / (1024 * 1024)

def bytes_to_gb(size_bytes):
    """Converts bytes to gigabytes."""
    return size_bytes / (1024 * 1024 * 1024)

def configure_spark(dataset_size_gb):
    """Configures Spark based on dataset size.

    Args:
        dataset_size_gb (float): Size of the dataset in gigabytes.

    Returns:
        tuple: Executor cores and memory configuration.
    """
    if dataset_size_gb < 1:
        executor_cores = 2
        executor_memory = "4g"
    elif 1 <= dataset_size_gb <= 10:
        executor_cores = 4
        executor_memory = "8g"
    else:
        executor_cores = 8
        executor_memory = "16g"
    return executor_cores, executor_memory


def build_spark_session(hdfs_path, file_path, verbose=False):
    """Builds a Spark session and retrieves file size from HDFS.

    Args:
        hdfs_path (str): HDFS path.
        file_path (str): File path within HDFS.
        verbose (bool, optional): Enable verbose output. Defaults to False.

    Returns:
        tuple: SparkSession, SparkContext, and file size.
    """
    spark = SparkSession.builder.appName("Project Group 32 HDFSFileSize").getOrCreate()
    jvm = spark._jvm
    conf = jvm.org.apache.hadoop.conf.Configuration()
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jvm.java.net.URI.create(hdfs_path), conf)
    path = jvm.org.apache.hadoop.fs.Path(file_path)
    fileStatus = fs.getFileStatus(path)
    fileSize = fileStatus.getLen()

    if verbose:
        print(f"File size in bytes: {fileSize}")

    spark.stop()

    executor_cores, executor_memory = configure_spark(bytes_to_gb(fileSize))

    if verbose:
        print(f"A files size of {bytes_to_gb(fileSize):.4f} GB give spark executors with:\n"+
            f"Cores: {executor_cores}\n"+
            f"Mem/core: {int(executor_memory[:-1])/executor_cores:.0f}GB")


    spark_session = SparkSession.builder\
            .master("spark://192.168.2.156:7077") \
            .appName("Project Group 32 Andreas")\
            .config("spark.dynamicAllocation.enabled", True)\
            .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
            .config("spark.shuffle.service.enabled", False)\
            .config("spark.dynamicAllocation.executorIdleTimeout","60s")\
            .config("spark.executor.cores", executor_cores)\
            .config("spark.executor.memory", executor_memory)\
            .config("spark.driver.port",9999)\
            .config("spark.blockManager.port",10005)\
            .getOrCreate()

    # RDD API
    spark_context = spark_session.sparkContext
    spark_context.setLogLevel("ERROR")

    if verbose:
        print(f"Executor cores: {spark_session.conf.get('spark.executor.cores')}")
        print(f"Executor memory: {spark_session.conf.get('spark.executor.memory')}")

    return spark_session, spark_context, fileSize

## Create a dataframe to analyse the posts line by line

In [None]:
def load_data(spark_session, hdfs_path, file_path, fileSize, verbose=False):
    """Loads JSON data from HDFS into a Spark DataFrame.

    Args:
        spark_session (SparkSession): Spark session.
        hdfs_path (str): HDFS path.
        file_path (str): File path within HDFS.
        fileSize (int): Size of the file in bytes.
        verbose (bool, optional): Enable verbose output. Defaults to False.

    Returns:
        DataFrame: Loaded Spark DataFrame.
    """
    # Load JSON file into a Spark DataFrame
    df = spark_session.read.json(hdfs_path + file_path)

    if verbose:
        # Count the number of partitions in the underlying RDD.
        print(f"Number of default partitions after loading the data: {df.rdd.getNumPartitions()}")
        print("\n")

    # Repartition using "subreddit" as key.
    # The partition size matches the HDFS block size in MB.
    no_partitions = math.ceil(bytes_to_mb(fileSize) / 128)
    partition_key =  "subreddit"
    df.repartition(no_partitions, partition_key)
    if verbose:
        print(f"The data is now repartitoned on key: '{partition_key}', into {df.rdd.getNumPartitions()} partitions.")
        print("\n")

        # Show schema to understand the structure
        print("The schema:")
        df.printSchema()
        print("\n")

        # Show first few rows to inspect data
        print("The first five entries in the dataframe:")
        df.show(5, truncate=False)
        print("\n")

        # Count total number of rows
        print(f"Total Rows: {df.count()}")
    return df

## How many Subreddits do exist?

-- We see that many post are not assigned to a Subreddit, since we want to train a Classification model, we delete the NULL post --

In [None]:

def filter_and_split_data(df, seed=42, verbose=False):
    """Filters and splits the DataFrame into training and test sets.

    Args:
        df (DataFrame): Input DataFrame.
        seed (int, optional): Random seed for splitting. Defaults to 42.
        verbose (bool, optional): Enable verbose output. Defaults to False.

    Returns:
        tuple: Training DataFrame, test DataFrame, and filtered data count.
    """
    unique_subreddits = df.select("subreddit").distinct().count()
    if verbose:
        print(f"Unique Subreddits: {unique_subreddits}")
        df.groupBy("subreddit").count().orderBy(col("count").desc()).show(10, False)
    else:
        df.groupBy("subreddit").count().orderBy(col("count").desc())

    # Filter out rows where subreddit is NULL
    df_filtered = df.filter(col("subreddit").isNotNull())

    if verbose:
        # Show first few rows after filtering
        df_filtered.show(5, truncate=False)

        # Count remaining rows
        print(f"Total Posts After Filtering: {df_filtered.count()}")

    # Filter out NULL subreddit, summary, or content
    df_filtered = df.filter((col("subreddit").isNotNull()) & (col("summary").isNotNull()) & (col("content").isNotNull()))

    if verbose:
        # Show filtered data
        df_filtered.select("subreddit", "summary", "content").show(5, truncate=False)

    # Split data into training and test sets
    train_data, test_data = df_filtered.randomSplit([0.8, 0.2], seed=seed)

    if verbose:
        print(f"Training set contains {train_data.count()} samples\n" +
              f"Test set contains {test_data.count()} samples.")

    return train_data, test_data, df_filtered.count()

## To prepare the Data for our ML Classification Model, we use the columns summary and content

## Create the training and test datasets

## We have to make the Text understandable for the algorithm

1. We first tokenize the the columns
2. Remove stop words, since they do not add information to the text
3. We convert the Text with TF-IDF to numbers

In [None]:
def pre_processing_pipe():
    """
    Creates a pipeline for pre-processing text data for machine learning.

    This pipeline includes tokenization, stop word removal, TF-IDF vectorization,
    label indexing, and feature assembly.

    Returns:
        list: A list of Spark ML pipeline stages.
    """
    # Tokenize summary and content
    tokenizer  = Tokenizer(inputCol="summary", outputCol="summary_tokens")
    tokenizer2 = Tokenizer(inputCol="content", outputCol="content_tokens")

    # Remove stopwords
    stopwords_remover  = StopWordsRemover(inputCol="summary_tokens", outputCol="summary_clean")
    stopwords_remover2 = StopWordsRemover(inputCol="content_tokens", outputCol="content_clean")

    # Convert words to numerical features using TF-IDF
    hashing_tf = HashingTF(inputCol="summary_clean", outputCol="summary_tf", numFeatures=1000)
    idf = IDF(inputCol="summary_tf", outputCol="summary_features")

    hashing_tf2 = HashingTF(inputCol="content_clean", outputCol="content_tf", numFeatures=1000)
    idf2 = IDF(inputCol="content_tf", outputCol="content_features")

    # Convert subreddit (text label) into a numerical label
    label_indexer = StringIndexer(inputCol="subreddit", outputCol="label", handleInvalid="keep")

    # Combine summary and content features
    feature_assembler = VectorAssembler(inputCols=["summary_features", "content_features"], outputCol="features")

    # Return pre-processing pipeline.
    return [tokenizer, tokenizer2, stopwords_remover, stopwords_remover2,
            hashing_tf, idf, hashing_tf2, idf2, label_indexer, feature_assembler]

In [None]:

def model_eval(model, test_data, description="", verbose=False):
    """
    Evaluates a machine learning model's accuracy on test data.

    Args:
        model: The trained Spark ML model.
        test_data (DataFrame): The test dataset.
        description (str, optional): A description of the model for output. Defaults to "".
        verbose (bool, optional): Enable verbose output. Defaults to False.

    Returns:
        float: The accuracy of the model.
    """
    # Make predictions on test data
    predictions = model.transform(test_data)

    # Evaluate model accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    if verbose:
      print(f"Evaluation of {description}. \n"+
            f"Model Accuracy: {accuracy:.4f}")
    
    return accuracy


In [None]:


def random_forest(train_data, pre_pipe):
    """
    Trains a Random Forest classification model.

    Args:
        train_data (DataFrame): The training dataset.
        pre_pipe (list): List of pre-processing stages.

    Returns:
        PipelineModel: The trained Random Forest model.
    """
    # Define the Random Forest classifier
    classifier = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)

    # Create a new pipeline using Random Forest
    pipeline = Pipeline(stages= pre_pipe + [classifier])

    # Train the model
    model = pipeline.fit(train_data)

    # Save the trained model
    #model.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier_rf")
    
    return model

In [None]:

def logistic_regression(train_data, pre_pipe):
    """
    Trains a Logistic Regression classification model.

    Args:
        train_data (DataFrame): The training dataset.
        pre_pipe (list): List of pre-processing stages.

    Returns:
        PipelineModel: The trained Logistic Regression model.
    """
    # Define the classification model
    classifier = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

    # Create a new pipeline using Logistic Regression
    pipeline = Pipeline(stages= pre_pipe + [classifier])

    # Train the model
    model = pipeline.fit(train_data)

    # Save the trained model
    # model.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier")

    return model

In [None]:
def print_results(file_path, fileSize, no_samples, executor_cores, executor_memory, execution_time, accuracy_rf, accuracy_lr):
    """
    Prints performance and evaluation results.

    Args:
        file_path (str): Path of the input file.
        fileSize (float): Size of the input file in GB.
        no_samples (int): Number of samples processed.
        executor_cores (int): Number of executor cores.
        executor_memory (str): Executor memory configuration.
        execution_time (float): Total execution time in seconds.
        accuracy_rf (float): Accuracy of the Random Forest model.
        accuracy_lr (float): Accuracy of the Logistic Regression model.
    """
    print("-" * 80)
    print("Spark Processing and Model Evaluation Results")
    print("-" * 80)

    print(f"File Path:        {file_path}")
    print(f"File Size:        {fileSize:.2f} GB")
    print(f"No samples:       {no_samples}")
    print(f"Executor Cores:   {executor_cores}")
    print(f"Executor Memory:  {executor_memory}")

    print("-" * 80)
    print("Performance Metrics:")
    print("-" * 80)

    print(f"Execution Time:   {execution_time:.2f} seconds")

    print("-" * 80)
    print("Model Accuracy:")
    print("-" * 80)

    print(f"Random Forest Accuracy:     {accuracy_rf:.4f}")
    print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")

    print("-" * 80)

In [259]:
hdfs_path = "hdfs://192.168.2.156:9000"


In [248]:
verbose=False
file_path = "/data/reddit/reddit_50k.json"
reddit_50_samples = np.zeros(5)
reddit_50_time    = np.zeros(5)
reddit_50_rf_acc  = np.zeros(5)
reddit_50_lr_acc  = np.zeros(5)

for i in range(5):
    print(f"Run: {i}")
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_time = time.time()
    pre_time = pre_time - start_time
    print(f"Pre-processing time: {pre_time:.4f}")

    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    rf_time  = time.time()
    rf_time  = rf_time - start_time - pre_time
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier")
    print(f"Random forest time: {rf_time:.4f}")

    """ pre_pipe = pre_processing_pipe()
    model_lr = logistic_regression(train_data, pre_pipe)
    lr_time  = time.time()
    lr_time  = lr_time - start_time - rf_time
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier")
    print(f"Logistic regression time: {lr_time:.4f}") """
    accuracy_lr = 0

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")

    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    
    print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                executor_memory=executor_memory, execution_time=execution_time, 
                accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_50_samples.mean())
print(reddit_50_samples)

print(reddit_50_time.mean())
print(reddit_50_time)

print(reddit_50_rf_acc.mean())
print(reddit_50_rf_acc)

print(reddit_50_lr_acc.mean())
print(reddit_50_lr_acc)


Run: 0


                                                                                

Pre-processing time: 16.5790


                                                                                

Random forest time: 192.5081
--------------------------------------------------------------------------------
Spark Processing and Model Evaluation Results
--------------------------------------------------------------------------------
File Path:        /data/reddit/reddit_50k.json
File Size:        0.37 GB
No samples:       49817
Executor Cores:   2
Executor Memory:  4g
--------------------------------------------------------------------------------
Performance Metrics:
--------------------------------------------------------------------------------
Execution Time:   229.44 seconds
--------------------------------------------------------------------------------
Model Accuracy:
--------------------------------------------------------------------------------
Random Forest Accuracy:     0.2331
Logistic Regression Accuracy: 0.0000
--------------------------------------------------------------------------------
Run: 1


                                                                                

Pre-processing time: 15.9403


                                                                                

Random forest time: 191.6205
--------------------------------------------------------------------------------
Spark Processing and Model Evaluation Results
--------------------------------------------------------------------------------
File Path:        /data/reddit/reddit_50k.json
File Size:        0.37 GB
No samples:       49817
Executor Cores:   2
Executor Memory:  4g
--------------------------------------------------------------------------------
Performance Metrics:
--------------------------------------------------------------------------------
Execution Time:   227.44 seconds
--------------------------------------------------------------------------------
Model Accuracy:
--------------------------------------------------------------------------------
Random Forest Accuracy:     0.2331
Logistic Regression Accuracy: 0.0000
--------------------------------------------------------------------------------
Run: 2


                                                                                

Pre-processing time: 15.8499




Random forest time: 189.7341


                                                                                

--------------------------------------------------------------------------------
Spark Processing and Model Evaluation Results
--------------------------------------------------------------------------------
File Path:        /data/reddit/reddit_50k.json
File Size:        0.37 GB
No samples:       49817
Executor Cores:   2
Executor Memory:  4g
--------------------------------------------------------------------------------
Performance Metrics:
--------------------------------------------------------------------------------
Execution Time:   226.42 seconds
--------------------------------------------------------------------------------
Model Accuracy:
--------------------------------------------------------------------------------
Random Forest Accuracy:     0.2331
Logistic Regression Accuracy: 0.0000
--------------------------------------------------------------------------------
Run: 3


                                                                                

Pre-processing time: 16.1131


                                                                                

Random forest time: 191.2340
--------------------------------------------------------------------------------
Spark Processing and Model Evaluation Results
--------------------------------------------------------------------------------
File Path:        /data/reddit/reddit_50k.json
File Size:        0.37 GB
No samples:       49817
Executor Cores:   2
Executor Memory:  4g
--------------------------------------------------------------------------------
Performance Metrics:
--------------------------------------------------------------------------------
Execution Time:   227.42 seconds
--------------------------------------------------------------------------------
Model Accuracy:
--------------------------------------------------------------------------------
Random Forest Accuracy:     0.2331
Logistic Regression Accuracy: 0.0000
--------------------------------------------------------------------------------
Run: 4


                                                                                

Pre-processing time: 16.4119


                                                                                

Random forest time: 192.4618
--------------------------------------------------------------------------------
Spark Processing and Model Evaluation Results
--------------------------------------------------------------------------------
File Path:        /data/reddit/reddit_50k.json
File Size:        0.37 GB
No samples:       49817
Executor Cores:   2
Executor Memory:  4g
--------------------------------------------------------------------------------
Performance Metrics:
--------------------------------------------------------------------------------
Execution Time:   229.44 seconds
--------------------------------------------------------------------------------
Model Accuracy:
--------------------------------------------------------------------------------
Random Forest Accuracy:     0.2331
Logistic Regression Accuracy: 0.0000
--------------------------------------------------------------------------------
49817.0
[49817. 49817. 49817. 49817. 49817.]
228.0314193725586
[229.43670726 2

In [261]:
verbose=False
file_path = "/data/reddit/reddit_100k.json"
reddit_100_samples = np.zeros(5)
reddit_100_time    = np.zeros(5)
reddit_100_rf_acc  = np.zeros(5)
reddit_100_lr_acc  = np.zeros(5)

for i in range(5):
    print(f"Run: {i}")
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_time = time.time()
    pre_time = pre_time - start_time
    print(f"Pre-processing time: {pre_time:.4f}")

    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    rf_time  = time.time()
    rf_time  = rf_time - start_time - pre_time
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier")
    print(f"Random forest time: {rf_time:.4f}")

    """ pre_pipe = pre_processing_pipe()
    model_lr = logistic_regression(train_data, pre_pipe)
    lr_time  = time.time()
    lr_time  = lr_time - start_time - rf_time
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier")
    print(f"Logistic regression time: {lr_time:.4f}") """
    accuracy_lr = 0

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                executor_memory=executor_memory, execution_time=execution_time, 
                accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_100_samples.mean())
print(reddit_100_samples)

print(reddit_100_time.mean())
print(reddit_100_time)

print(reddit_100_rf_acc.mean())
print(reddit_100_rf_acc)

print(reddit_100_lr_acc.mean())
print(reddit_100_lr_acc)


Run: 0


                                                                                

Pre-processing time: 79.9350


                                                                                

Py4JJavaError: An error occurred while calling o19655.evaluate.
: java.lang.OutOfMemoryError: Java heap space


In [None]:
verbose=False
file_path = "/data/reddit/reddit_200k.json"
reddit_200_samples = np.zeros(5)
reddit_200_time    = np.zeros(5)
reddit_200_rf_acc  = np.zeros(5)
reddit_200_lr_acc  = np.zeros(5)

for i in range(5):
    print(f"Run: {i}")
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_time = time.time()
    pre_time = pre_time - start_time
    print(f"Pre-processing time: {pre_time:.4f}")

    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    rf_time  = time.time()
    rf_time  = rf_time - start_time - pre_time
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier")
    print(f"Random forest time: {rf_time:.4f}")

    """ pre_pipe = pre_processing_pipe()
    model_lr = logistic_regression(train_data, pre_pipe)
    lr_time  = time.time()
    lr_time  = lr_time - start_time - rf_time
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier")
    print(f"Logistic regression time: {lr_time:.4f}") """
    accuracy_lr = 0

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                executor_memory=executor_memory, execution_time=execution_time, 
                accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_200_samples.mean())
print(reddit_200_samples)

print(reddit_200_time.mean())
print(reddit_200_time)

print(reddit_200_rf_acc.mean())
print(reddit_200_rf_acc)

print(reddit_200_lr_acc.mean())
print(reddit_200_lr_acc)


In [None]:
verbose=False
file_path = "/data/reddit/reddit_500k.json"
reddit_500_samples = np.zeros(5)
reddit_500_time    = np.zeros(5)
reddit_500_rf_acc  = np.zeros(5)
reddit_500_lr_acc  = np.zeros(5)

for i in range(5):
    print(f"Run: {i}")
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_time = time.time()
    pre_time = pre_time - start_time
    print(f"Pre-processing time: {pre_time:.4f}")

    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    rf_time  = time.time()
    rf_time  = rf_time - start_time - pre_time
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier")
    print(f"Random forest time: {rf_time:.4f}")

    """ pre_pipe = pre_processing_pipe()
    model_lr = logistic_regression(train_data, pre_pipe)
    lr_time  = time.time()
    lr_time  = lr_time - start_time - rf_time
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier")
    print(f"Logistic regression time: {lr_time:.4f}") """
    accuracy_lr = 0

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                executor_memory=executor_memory, execution_time=execution_time, 
                accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_500_samples.mean())
print(reddit_500_samples)

print(reddit_500_time.mean())
print(reddit_500_time)

print(reddit_500_rf_acc.mean())
print(reddit_500_rf_acc)

print(reddit_500_lr_acc.mean())
print(reddit_500_lr_acc)


In [None]:
verbose=False
file_path = "/data/reddit/corpus-webis-tldr-17.json"
reddit_full_samples = np.zeros(5)
reddit_full_time    = np.zeros(5)
reddit_full_rf_acc  = np.zeros(5)
reddit_full_lr_acc  = np.zeros(5)

for i in range(5):
    print(f"Run: {i}")
    start_time = time.time()

    spark_session, spark_context, fileSize = build_spark_session(hdfs_path, file_path, verbose=verbose)
    df = load_data(spark_session, hdfs_path, file_path, fileSize, verbose=verbose)
    train_data, test_data, no_samples = filter_and_split_data(df, verbose=verbose)
    pre_time = time.time()
    pre_time = pre_time - start_time
    print(f"Pre-processing time: {pre_time:.4f}")

    pre_pipe = pre_processing_pipe()
    model_rf = random_forest(train_data, pre_pipe)
    rf_time  = time.time()
    rf_time  = rf_time - start_time - pre_time
    accuracy_rf = model_eval(model_rf, test_data, description="Random forest classifier")
    print(f"Random forest time: {rf_time:.4f}")

    """ pre_pipe = pre_processing_pipe()
    model_lr = logistic_regression(train_data, pre_pipe)
    lr_time  = time.time()
    lr_time  = lr_time - start_time - rf_time
    accuracy_lr = model_eval(model_lr, test_data, description="Logistic regression classifier")
    print(f"Logistic regression time: {lr_time:.4f}") """
    accuracy_lr = 0

    executor_cores = spark_session.conf.get("spark.executor.cores")
    executor_memory = spark_session.conf.get("spark.executor.memory")
    spark_context.stop()

    end_time = time.time()
    execution_time = end_time - start_time

    reddit_50_samples[i] = no_samples
    reddit_50_time[i]    = execution_time
    reddit_50_rf_acc[i]  = accuracy_rf
    reddit_50_lr_acc[i]  = accuracy_lr

    print_results(file_path=file_path, fileSize=bytes_to_gb(fileSize), no_samples=no_samples, executor_cores=executor_cores, 
                executor_memory=executor_memory, execution_time=execution_time, 
                accuracy_rf=accuracy_rf, accuracy_lr=accuracy_lr)
            
print(reddit_full_samples.mean())
print(reddit_full_samples)

print(reddit_full_time.mean())
print(reddit_full_time)

print(reddit_full_rf_acc.mean())
print(reddit_full_rf_acc)

print(reddit_full_lr_acc.mean())
print(reddit_full_lr_acc)


In [262]:
spark_context.stop()