In [1]:
import os
import shutil
# Configure the environment PySpark reads at start-up: where Spark is installed,
# how the pyspark launcher starts the driver, and which interpreter is used for Python.
os.environ.update({
    'SPARK_HOME': "/opt/spark",
    'PYSPARK_DRIVER_PYTHON': "jupyter",
    'PYSPARK_DRIVER_PYTHON_OPTS': "lab",
    'PYSPARK_PYTHON': "python",
})

In [2]:
from pyspark.sql import SparkSession

In [None]:
# Build the SparkSession for this notebook (getOrCreate reuses an existing one if present)
spark = (
    SparkSession.builder
    .appName("RDD-Demo")
    .getOrCreate()
)

### **How to create RDDs**

In [None]:
# Turn a small in-memory Python list (the integers 1 through 6) into an RDD
numbers = list(range(1, 7))
rdd = spark.sparkContext.parallelize(numbers)

In [None]:
# Collect action: retrieve all elements of the RDD.
# As the last expression in the cell, the collected result is shown as the cell output.
rdd.collect()

In [7]:
# Create an RDD from a list of (name, age) tuples.
# Note: this rebinds `rdd`, replacing the numbers RDD created earlier.
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
rdd = spark.sparkContext.parallelize(data)

In [None]:
# Collect action: retrieve all elements of the RDD and print them on one line
print("All elements of the rdd: ", rdd.collect())

# **RDD Operations: Actions**

In [None]:
# Count action: count the number of elements in the RDD
# (here `rdd` is the RDD of (name, age) tuples)
count = rdd.count()
print("The total number of elements in rdd: ", count)

In [None]:
# First action: retrieve only the first element of the RDD
# (a single (name, age) tuple rather than a list)
first_element = rdd.first()
print("The first element of the rdd: ", first_element)

In [None]:
# Take action: retrieve the first n elements of the RDD (here n = 2)
taken_elements = rdd.take(2)
print("The first two elements of the rdd: ", taken_elements)

In [None]:
# Print every element of the RDD, one per line.
# rdd.foreach(print) would run the print calls inside the executor worker
# processes, so their output goes to the executors' stdout and does not show up
# in this notebook cell. The RDD is tiny, so bring it back to the driver with
# collect() and print from here instead.
for element in rdd.collect():
    print(element)

# **RDD Operations: Transformations**

In [13]:
# Map transformation: upper-case the name (field 0) and keep the age (field 1) unchanged
mapped_rdd = rdd.map(
    lambda person: (person[0].upper(), person[1])
)

In [None]:
# Collect the mapped RDD and print the records with upper-cased names
result = mapped_rdd.collect()
print("rdd with uppercase name: ", result)

In [None]:
# Filter transformation: keep only the records whose age (field 1) is strictly above 30
filtered_rdd = rdd.filter(
    lambda person: person[1] > 30
)
filtered_rdd.collect()

In [None]:
# ReduceByKey transformation: sum the ages of all records that share the same name (the key)
reduced_rdd = rdd.reduceByKey(
    lambda age_a, age_b: age_a + age_b
)
reduced_rdd.collect()

In [None]:
# SortBy transformation: order the RDD by age (field 1), oldest first
sorted_rdd = rdd.sortBy(
    lambda person: person[1],
    ascending=False,
)
sorted_rdd.collect()

# **Save RDDs to a text file and read RDDs from a text file**

In [18]:
# Save action: write the RDD out as text.
# saveAsTextFile() creates a *directory* called "output.txt" (holding part files)
# and raises an error if that path already exists, so a second run of this cell
# would fail. Remove any output left by a previous run first so the notebook
# survives Restart & Run All.
# NOTE: this assumes the relative path resolves to the local filesystem; if the
# default filesystem is HDFS/S3, delete the old output there instead.
shutil.rmtree("output.txt", ignore_errors=True)
rdd.saveAsTextFile("output.txt")

                                                                                

In [None]:
# Create an RDD from the saved text output; each element is one line of text (a string)
rdd_text = spark.sparkContext.textFile("output.txt")
# Collect the RDD that was just read back from disk, not the original in-memory `rdd`
rdd_text.collect()