# Example 1: Immutable RDD

In [1]:
from pyspark import SparkConf, SparkContext

# Create (or reuse) the SparkContext instance.
# Only one SparkContext may be active per process, so calling the constructor
# again when this cell is re-run raises
# "ValueError: Cannot run multiple SparkContexts at once".
# getOrCreate() returns the already-running context instead; in that case the
# conf below is not applied to it.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("ExampleApp")
)

In [2]:
# Example 1 setup: distribute a small local list as an RDD and print its ID,
# so it can be compared with the ID of the RDD produced by the transformation
# in the next cell.
numbers = list(range(1, 6))
numbers_rdd = sc.parallelize(numbers)
print(f"Original RDD ID: {numbers_rdd.id()}")

Original RDD ID: 0


In [3]:
# Apply a transformation: multiply each number by 2.
# map() returns a new RDD rather than modifying numbers_rdd, so the ID printed
# here differs from the original RDD's ID.
transformed_rdd = numbers_rdd.map(lambda x: x * 2)
print(f"Transformed RDD ID: {transformed_rdd.id()}")

Original RDD ID: 1


In [4]:
# Collect the results to trigger the computation. collect() is an action (not a
# transformation): it executes the pending map() and returns the elements to
# the driver as a Python list.
result = transformed_rdd.collect()
print(f"Transformed RDD Result: {result}")

Transformed RDD Result: [2, 4, 6, 8, 10]


# Example 2: Text Manipulation RDD

In [5]:
# Example 2 input: three short sentences distributed as an RDD of strings.
text = [
    "Hello Spark",
    "Hello Scala",
    "Hello World",
]
text_rdd = sc.parallelize(text)
# take(10) returns at most 10 elements; this RDD holds only 3, so all are shown.
print(f"Original Text RDD Result: {text_rdd.take(10)}")

Original Text RDD Result: ['Hello Spark', 'Hello Scala', 'Hello World']


In [6]:
# Split every sentence on single spaces; flatMap flattens the per-sentence
# word lists into one RDD of individual words.
words_rdd = text_rdd.flatMap(lambda sentence: sentence.split(" "))
print(f"Words RDD Result: {words_rdd.take(10)}")

Words RDD Result: ['Hello', 'Spark', 'Hello', 'Scala', 'Hello', 'World']


In [7]:
# Upper-case each word; str.upper is passed directly instead of being wrapped
# in a lambda.
upper_words_rdd = words_rdd.map(str.upper)
print(f"Upper Words RDD Result: {upper_words_rdd.take(10)}")

Upper Words RDD Result: ['HELLO', 'SPARK', 'HELLO', 'SCALA', 'HELLO', 'WORLD']


# Aggregation: groupByKey Vs. reduceByKey

In [8]:
# Example 3: Group By Transformation
# Repeat a five-pair pattern 5,000,000 times (25,000,000 pairs in total) to
# build the key/value RDD used by the aggregation timings in the cells below.
pair_pattern = [("A", 1), ("B", 1), ("A", 2), ("B", 1), ("A", 3)]
pairs_rdd = sc.parallelize(pair_pattern * 5_000_000)
print(f"Original Pairs RDD Result: {pairs_rdd.take(10)}")

Original Pairs RDD Result: [('A', 1), ('B', 1), ('A', 2), ('B', 1), ('A', 3), ('A', 1), ('B', 1), ('A', 2), ('B', 1), ('A', 3)]


In [9]:
import time

# Measure the performance of groupByKey and sum.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted mid-measurement.
start_time = time.perf_counter()

# Gather every value for a key, then sum each group. sum is passed directly
# because it already accepts the iterable of grouped values.
grouped_rdd = pairs_rdd.groupByKey().mapValues(sum)
# collect() is the action that actually runs the job, so it must sit inside
# the timed span.
grouped_result = grouped_rdd.collect()
group_by_key_duration = time.perf_counter() - start_time

print(f"groupByKey duration: {group_by_key_duration:.4f} seconds")
print(f"Grouped RDD Result (sum): {grouped_result[:10]}")

groupByKey duration: 6.9552 seconds
Grouped RDD Result (sum): [('A', 30000000), ('B', 10000000)]


In [10]:
# Measure the performance of reduceByKey and sum.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted mid-measurement.
reduce_start_time = time.perf_counter()

# Merge the values of each key pairwise with addition.
reduced_rdd = pairs_rdd.reduceByKey(lambda x, y: x + y)
# collect() is the action that actually runs the job, so it must sit inside
# the timed span.
reduced_result = reduced_rdd.collect()
reduce_by_key_duration = time.perf_counter() - reduce_start_time

print(f"reduceByKey duration: {reduce_by_key_duration:.4f} seconds")
print(f"Reduced RDD Result (sum): {reduced_result[:10]}")

reduceByKey duration: 5.6028 seconds
Reduced RDD Result (sum): [('A', 30000000), ('B', 10000000)]
