**RDD Exercises – Set 2**

Setup

In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present)
# and keep a handle on its SparkContext, which the cells below use to build RDDs.
spark = (
    SparkSession.builder
    .appName("RDD-Exercises-Set2")
    .getOrCreate()
)
sc = spark.sparkContext


1. Numbers Practice

In [2]:
# Source RDD holding the integers 1 through 15 (range's upper bound is exclusive)
numbers_rdd = sc.parallelize(range(1, 16))

# Each value multiplied by two; this is a lazy transformation that only
# executes when collect() is called in the results section
doubled_rdd = numbers_rdd.map(lambda n: n * 2)

# Multiples of 3, pulled back to the driver as a Python list
multiples_of_3_rdd = numbers_rdd.filter(lambda n: n % 3 == 0)
div_by_3 = multiples_of_3_rdd.collect()

# Number of values strictly greater than 10
above_10_rdd = numbers_rdd.filter(lambda n: n > 10)
greater_than_10_count = above_10_rdd.count()

# Results
print("Divisible by 3:", div_by_3)
print("Doubled RDD:", doubled_rdd.collect())
print("Count > 10:", greater_than_10_count)


Divisible by 3: [3, 6, 9, 12, 15]
Doubled RDD: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
Count > 10: 5


2. String Processing

In [3]:
fruits_rdd = sc.parallelize(["apple", "banana", "grape", "banana", "apple", "mango"])

# Each fruit name once, duplicates removed
distinct_fruits = fruits_rdd.distinct().collect()

# Classic word count: pair every fruit with 1, then add up the ones per fruit
fruit_ones = fruits_rdd.map(lambda fruit: (fruit, 1))
fruit_counts = fruit_ones.reduceByKey(lambda left, right: left + right).collect()

# Longest word: compare (length, word) tuples so length decides first and the
# word itself breaks ties, then keep only the word from the winning tuple
length_and_fruit = fruits_rdd.map(lambda fruit: (len(fruit), fruit))
longest_fruit = length_and_fruit.max()[1]

# Results
print("Distinct fruits:", distinct_fruits)
print("Fruit counts:", fruit_counts)
print("Longest fruit:", longest_fruit)

Distinct fruits: ['apple', 'banana', 'grape', 'mango']
Fruit counts: [('apple', 2), ('banana', 2), ('grape', 1), ('mango', 1)]
Longest fruit: banana


3. Sentence Split

In [4]:
sentences_rdd = sc.parallelize([
    "spark makes big data easy",
    "rdd is the core of spark",
    "python with spark"
])

# flatMap flattens the per-sentence word lists into a single RDD of words
# (split() with no argument splits on runs of whitespace)
words_rdd = sentences_rdd.flatMap(lambda sentence: sentence.split())

# Normalise case first so that differently-cased spellings collapse together,
# then drop the duplicates
lowercased_words = words_rdd.map(lambda token: token.lower())
unique_words = lowercased_words.distinct()

# Size of the de-duplicated vocabulary
unique_word_count = unique_words.count()

# Results
print("Unique words:", unique_words.collect())
print("Total unique word count:", unique_word_count)

Unique words: ['big', 'easy', 'rdd', 'core', 'of', 'python', 'with', 'spark', 'makes', 'data', 'is', 'the']
Total unique word count: 12


4. Pair RDD Operations

In [5]:
marks_rdd = sc.parallelize([
    ("Rahul", 85), ("Priya", 92), ("Aman", 78),
    ("Rahul", 90), ("Priya", 88)
])

# Sum of all marks for each student
total_marks = marks_rdd.reduceByKey(lambda left, right: left + right).collect()

# Average marks per student, built in three steps:
#   1. turn each mark into a (mark, 1) pair
#   2. add the pairs element-wise per student -> (sum_of_marks, number_of_marks)
#   3. divide the sum by the number of marks
# Note: despite its name, count_marks ends up holding the per-student averages.
mark_with_one = marks_rdd.mapValues(lambda mark: (mark, 1))
sum_and_count = mark_with_one.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
count_marks = sum_and_count.mapValues(
    lambda totals: totals[0] / totals[1]
).collect()

# The single (student, mark) record carrying the highest mark
top_student = marks_rdd.max(key=lambda record: record[1])

# Results
print("Total marks per student:", total_marks)
print("Average marks per student:", count_marks)
print("Highest individual mark:", top_student)

Total marks per student: [('Rahul', 175), ('Priya', 180), ('Aman', 78)]
Average marks per student: [('Rahul', 87.5), ('Priya', 90.0), ('Aman', 78.0)]
Highest individual mark: ('Priya', 92)


5. Reduce & Aggregate

In [6]:
nums_rdd = sc.parallelize([5, 10, 15, 20, 25])

# Fold the RDD down to a single value by pairwise addition
total_sum = nums_rdd.reduce(lambda acc, value: acc + value)

# Same idea, with multiplication as the combining operation
product = nums_rdd.reduce(lambda acc, value: acc * value)

# Manual average: the sum computed above divided by the element count
count = nums_rdd.count()
average = total_sum / count

# Results
print("Sum:", total_sum)
print("Product:", product)
print("Average:", average)


Sum: 75
Product: 375000
Average: 15.0


6. Word Length Analysis

In [7]:
words_rdd = sc.parallelize(["data", "engineering", "spark", "rdd", "pyspark", "analytics"])

# Pair every word with its character count: (word, length)
word_lengths = words_rdd.map(lambda term: (term, len(term)))

# The (word, length) pair with the largest length
longest_word = word_lengths.max(key=lambda pair: pair[1])

# Average word length = total characters across all words / number of words
lengths_only = word_lengths.map(lambda pair: pair[1])
total_chars = lengths_only.reduce(lambda acc, length: acc + length)
word_count = words_rdd.count()
avg_length = total_chars / word_count

# Results
print("Word lengths:", word_lengths.collect())
print("Longest word:", longest_word)
print("Average word length:", avg_length)


Word lengths: [('data', 4), ('engineering', 11), ('spark', 5), ('rdd', 3), ('pyspark', 7), ('analytics', 9)]
Longest word: ('engineering', 11)
Average word length: 6.5


7. Joins

In [8]:
students_rdd = sc.parallelize([(1, "Rahul"), (2, "Priya"), (3, "Aman")])
courses_rdd = sc.parallelize([(1, "Python"), (2, "Spark"), (4, "Databases")])

# Inner join: only ids that appear in both RDDs
inner_join = students_rdd.join(courses_rdd).collect()

# Left outer join: every student is kept; a student without a course gets None
left_outer = students_rdd.leftOuterJoin(courses_rdd).collect()

# Right outer join with students as the RIGHT side: every student is kept and
# the joined values arrive as (course, student). The id key is dropped and each
# pair is flipped to (student, course).
# NOTE(review): because students are on the right, this keeps the same rows as
# the left outer join above, and course id 4 ("Databases") never appears. If the
# goal is to keep every course, students_rdd.rightOuterJoin(courses_rdd) is
# probably what was meant - confirm the intent of the exercise.
course_then_student = courses_rdd.rightOuterJoin(students_rdd).values()
right_outer = course_then_student.map(
    lambda pair: (pair[1], pair[0])
).collect()

# Results
print("Inner Join:", inner_join)
print("Left Outer Join:", left_outer)
print("Right Outer Join:", right_outer)


Inner Join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]
Left Outer Join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark')), (3, ('Aman', None))]
Right Outer Join: [('Rahul', 'Python'), ('Priya', 'Spark'), ('Aman', None)]


8. Mini Real-World

In [9]:
# Each record is a (customer_id, order_amount) pair
orders_rdd = sc.parallelize([
    (1, 200), (2, 500), (3, 300), (1, 150), (2, 250)
])

# Aggregate spend per customer ONCE. Both actions below (collect and max) read
# this RDD; caching it means the second action reuses the already-reduced data
# instead of building and running the same reduceByKey shuffle a second time.
spend_per_customer_rdd = orders_rdd.reduceByKey(lambda a, b: a + b).cache()

# Total spend per customer, as a list of (customer_id, total) on the driver
total_spend = spend_per_customer_rdd.collect()

# Customer with maximum spend: the (customer_id, total) pair with the largest total
max_customer = spend_per_customer_rdd.max(key=lambda x: x[1])

# Total revenue from all customers: sum of every order amount, ignoring the ids
total_revenue = orders_rdd.map(lambda x: x[1]).reduce(lambda a, b: a + b)

# Results
print("Total spend per customer:", total_spend)
print("Customer with max spend:", max_customer)
print("Total revenue:", total_revenue)


Total spend per customer: [(2, 750), (1, 350), (3, 300)]
Customer with max spend: (2, 750)
Total revenue: 1400
