In [None]:
#################################################################################################################################################
# In the weak scaling tests, the work for each core is kept constant while the total data increases with the number of cores                    #
# The code below does 12*iterations tests. There are 3 different problem sizes for the cluster and 4 different configurations of the resources. #
# 1 worker 1 core                                                                                                                               #
# 1 worker 2 cores                                                                                                                              #
# 2 workers 1 core each                                                                                                                         #    
# 2 workers 2 cores each                                                                                                                        #
# Given the problem size, the code tests every configuration `iterations` times                                                                 #
# The data of the analysis is stored in results_df                                                                                              #
#################################################################################################################################################

# Imports
!pip install pandas
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, split, array_contains
import time
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Run parameters.
# Resource configurations to test: number of workers and cores per worker.
executor_nodes_options = [1, 2]
executor_cores_options = [1, 2]
# Number of files used in testing (each file is 100 comments).
chunk_amounts = [100, 300, 600]
# How many timed runs are made for each of the settings above.
iterations = 2

# Fetch the VADER lexicon first, then build the single analyzer instance
# that analyze_sentiment() uses for scoring.
nltk.download("vader_lexicon")
sid = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The comment text to score. When this function runs as a Spark
            UDF, null column values arrive here as ``None``.

    Returns:
        The ``'compound'`` entry of ``sid.polarity_scores(text)``, or ``None``
        when ``text`` is ``None``.
    """
    if text is None:
        # Scoring None would raise and fail the whole Spark job; a null score
        # instead simply matches neither the "> 0" nor the "< 0" filter.
        return None
    sentiment_score = sid.polarity_scores(text)
    return sentiment_score['compound']

# Accumulates one dict per timed run; turned into a pandas DataFrame at the end.
results = list()
# Spark UDF wrapper around analyze_sentiment, declared as returning a float column.
sentiment_udf = udf(f=analyze_sentiment, returnType=FloatType())

# Test matrix: runs every chosen chunk count against every chosen combination of worker count and cores per worker
for datasize in chunk_amounts:
    for executor_nodes in executor_nodes_options:
        for executor_cores in executor_cores_options:
            total_cores = executor_nodes * executor_cores
            # Weak scaling: the number of input files grows with the total core count
            total_chunks = datasize * total_cores

            # Sets up a Spark session for the given test
            spark_session = (
                SparkSession.builder
                .appName(f"DF_cores_{executor_cores}_instances_{executor_nodes}_datasize_{total_chunks}")
                .master("spark://192.168.2.230:7077")
                .config("spark.dynamicAllocation.enabled", True)
                .config("spark.executor.instances", executor_nodes)
                .config("spark.executor.cores", executor_cores)
                # Pin dynamic allocation to exactly `executor_nodes` executors.
                # With both bounds hard-coded to 1, the two-worker
                # configurations could never be granted a second executor.
                .config("spark.dynamicAllocation.minExecutors", str(executor_nodes))
                .config("spark.dynamicAllocation.maxExecutors", str(executor_nodes))
                .config("spark.driver.port", 9999)
                .config("spark.blockManager.port", 10005)
                .getOrCreate()
            )
            try:
                # Loads in all the files (chunk files are numbered from 1)
                file_paths = [
                    f"hdfs://master:9000/path/in/hdfs/JsonFiles/corpus-webis-shortened_chunk{i}.json"
                    for i in range(1, total_chunks + 1)
                ]
                df = spark_session.read.option("multiline", "true").json(file_paths)

                # Executing the tests
                for iteration in range(iterations):
                    start_time = time.time()
                    df_with_sentiment = df.withColumn("sentiment_score", sentiment_udf(df["content"]))
                    # Each count() is a separate Spark action, so both are inside the timed region
                    positive_comments = df_with_sentiment.filter(col("sentiment_score") > 0).count()
                    negative_comments = df_with_sentiment.filter(col("sentiment_score") < 0).count()
                    end_time = time.time()
                    processing_time = end_time - start_time
                    results.append({
                        'Workers': executor_nodes,
                        'Cores/W': executor_cores,
                        'Total Cores': total_cores,
                        'Chunks': total_chunks,
                        'Iteration': iteration + 1,
                        'Time (s)': processing_time,
                        'Positive Comments': positive_comments,
                        'Negative Comments': negative_comments,
                    })
                print("Session complete")
            finally:
                # Always stop the session, even if a run fails: otherwise the
                # next getOrCreate() would return this session with its old
                # executor settings instead of building a new one.
                spark_session.stop()
           
# Build a pandas table from the collected per-run records and print it
results_df = pd.DataFrame(data=results)
print(results_df)

In [19]:
# Stops whichever session `spark_session` currently refers to (the last one created by the test loop).
# NOTE(review): presumably a manual cleanup cell for runs interrupted before their own stop() call — confirm.
spark_session.stop()