In [1]:
# Assuming the sentiment_udf is already defined/imported
# from pyspark.sql.functions import udf
# sentiment_udf = udf(<your sentiment analysis function>, <return type>)
# 68 Start


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, split, array_contains
import time
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

results = []
chunk_amounts = [100, 600, 3600]
executor_cores_options = [1, 2]
executor_nodes_options = [1, 2]
iterations = 2

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    sentiment_score = sid.polarity_scores(text)
    return sentiment_score['compound']

sentiment_udf = udf(analyze_sentiment, FloatType())


for datasize in chunk_amounts:
    for executor_nodes in executor_nodes_options:
        for executor_cores in executor_cores_options:
            # Initialize Spark session with specified configuration
            spark_session = SparkSession.builder\
                            .appName(f"DF_cores_{executor_cores}_instances_{executor_nodes}_datasize_{datasize}")\
                            .master("spark://192.168.2.230:7077")\
                            .config("spark.dynamicAllocation.enabled", "true")\
                            .config("spark.executor.instances", executor_nodes)\
                            .config("spark.executor.cores", executor_cores)\
                            .getOrCreate()
            
            # Keep the data size constant across all configurations
            file_paths = [f"hdfs://master:9000/path/in/hdfs/JsonFiles/corpus-webis-shortened_chunk{i}.json" for i in range(1, datasize + 1)]
            df = spark_session.read.option("multiline", "true").json(file_paths)
            
            for iteration in range(iterations):
                start_time = time.time()
                
                # Assuming sentiment_udf is defined elsewhere to analyze sentiment
                df_with_sentiment = df.withColumn("sentiment_score", sentiment_udf(col("content")))
                
                # Count positive and negative comments based on sentiment scores
                positive_comments = df_with_sentiment.filter(col("sentiment_score") > 0).count()
                negative_comments = df_with_sentiment.filter(col("sentiment_score") < 0).count()
                
                end_time = time.time()
                processing_time = end_time - start_time
                
                # Append the results to your results list
                results.append({
                    'Workers': executor_nodes,
                    'Cores/W': executor_cores,
                    'Total Cores': executor_cores * executor_nodes,
                    'Chunks': datasize,
                    'Iteration': iteration + 1,
                    'Time (s)': processing_time,
                    'Positive Comments': positive_comments,
                    'Negative Comments': negative_comments
                })
            
            # Stop the Spark session to release resources before the next iteration
            spark_session.stop()

# Convert the collected results into a pandas DataFrame for analysis
results_df = pd.DataFrame(results)
print(results_df)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 17:34:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/14 17:35:43 ERROR DAGScheduler: Failed to update accumulator 0 (org.apache.spark.api.python.PythonAccumulatorV2) for task 3
org.apache.spark.SparkException: EOF reached before Python server acknowledged
	at org.apache.spark.api.python.PythonAccumulatorV2.merge(PythonRDD.scala:751)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$updateAccumulators$1(DAGScheduler.scala:1694)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$updateAccumulators$1$adapted(DAGScheduler.scala:1685)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at sc

    Workers  Cores/W  Total Cores  Chunks  Iteration    Time (s)  \
0         1        1            1     100          1   19.191793   
1         1        1            1     100          2    9.470902   
2         1        2            2     100          1   13.313256   
3         1        2            2     100          2    8.719398   
4         2        1            2     100          1   13.790897   
5         2        1            2     100          2    9.188600   
6         2        2            4     100          1   11.263053   
7         2        2            4     100          2    8.775099   
8         1        1            1     600          1   50.276575   
9         1        1            1     600          2   47.281941   
10        1        2            2     600          1   48.976984   
11        1        2            2     600          2   45.683618   
12        2        1            2     600          1   50.293894   
13        2        1            2     600       