# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType
import time
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon used for sentiment analysis.
# This runs at import time, in the process that executes this script.
nltk.download('vader_lexicon')

# Module-level analyzer instance; analyze_sentiment() reads it as the
# global name `sid`.
sid = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the VADER compound sentiment score of ``text``.

    Args:
        text: The text to score, or ``None``. Spark hands SQL NULL values
            to Python UDFs as ``None``, so a missing ``content`` value must
            not crash the whole job.

    Returns:
        The ``'compound'`` entry of ``sid.polarity_scores(text)``, or
        ``None`` when ``text`` is ``None`` (which becomes a NULL score and
        is therefore counted as neither positive nor negative).
    """
    # Guard first: the analyzer cannot score a missing value.
    if text is None:
        return None

    # Calculate the sentiment scores and keep only the compound score
    sentiment_score = sid.polarity_scores(text)
    return sentiment_score['compound']

# Simulation parameters
iterations = 2  # Number of timed repetitions per configuration
executor_nodes_options = [1, 2]  # Number of executor nodes
executor_cores_options = [1, 2]  # Number of cores per executor
chunk_amounts = [100, 600, 3600]  # Number of chunk files to process

# Accumulates one record per timed run
results = list()

# Wrap the sentiment function as a Spark UDF that yields a float column
sentiment_udf = udf(f=analyze_sentiment, returnType=FloatType())

# Run the benchmark for every combination of data size, executor count and
# cores per executor, appending one record per timed iteration to `results`.
for datasize in chunk_amounts:
    # Generate HDFS file paths based on the data size (chunks are numbered from 1)
    file_paths = [f"hdfs://master:9000/path/in/hdfs/JsonFiles/corpus-webis-shortened_chunk{i}.json" for i in range(1, datasize + 1)]

    for executor_nodes in executor_nodes_options:
        for executor_cores in executor_cores_options:
            # Initialize Spark session with specified configuration.
            # The dynamic-allocation bounds are tied to executor_nodes: a
            # fixed maximum of 1 would cap every run at a single executor,
            # so the 'Workers' / 'Total Cores' values recorded below would
            # not describe the resources actually used.
            spark_session = SparkSession.builder\
                           .appName(f"STRONG_SCAL_DF_cores_{executor_cores}_instances_{executor_nodes}_datasize_{datasize}")\
                           .master("spark://192.168.2.230:7077")\
                           .config("spark.dynamicAllocation.enabled", True)\
                           .config("spark.executor.instances", executor_nodes)\
                           .config("spark.executor.cores", executor_cores)\
                           .config("spark.dynamicAllocation.minExecutors", str(executor_nodes))\
                           .config("spark.dynamicAllocation.maxExecutors", str(executor_nodes))\
                           .config("spark.driver.port", 9999)\
                           .config("spark.blockManager.port", 10005)\
                           .getOrCreate()

            # Always stop the session, even when a read or count fails:
            # otherwise the next getOrCreate() would hand back this same
            # session and silently ignore the new configuration.
            try:
                # Read and cache the data for performance
                df = spark_session.read.option("multiline", "true").json(file_paths).cache()

                # Apply sentiment analysis to the content column and cache the result
                df_with_sentiment = df.withColumn("sentiment_score", sentiment_udf(col("content"))).cache()

                # NOTE: cache() is lazy, so the first timed iteration also
                # pays for reading the files and evaluating the UDF; later
                # iterations can be served from the cache.
                for iteration in range(iterations):
                    start_time = time.time()

                    # Count positive and negative comments using the cached dataframe
                    positive_comments = df_with_sentiment.filter(col("sentiment_score") > 0).count()
                    negative_comments = df_with_sentiment.filter(col("sentiment_score") < 0).count()

                    end_time = time.time()
                    processing_time = end_time - start_time

                    # Collect the results for analysis
                    results.append({
                        'Workers': executor_nodes,
                        'Cores/W': executor_cores,
                        'Total Cores': executor_cores * executor_nodes,
                        'Chunks': datasize,
                        'Iteration': iteration + 1,
                        'Time (s)': processing_time,
                        'Positive Comments': positive_comments,
                        'Negative Comments': negative_comments
                    })

                # Release the cached dataframes before tearing the session down
                df.unpersist()
                df_with_sentiment.unpersist()
            finally:
                # Cleanup: stop the Spark session for this configuration
                spark_session.stop()

# Tabulate the collected run records with pandas and display the table
results_df = pd.DataFrame(data=results)
print(results_df)
