In [None]:
from pyspark.sql import SparkSession

# Build (or fetch, if one already exists) a SparkSession running in local mode
spark = (
    SparkSession.builder
    .appName("RDDExamples")
    .master("local[*]")
    .getOrCreate()
)

# The SparkContext behind the session is the entry point to the RDD API
sc = spark.sparkContext

# Turn a small in-memory Python list into an RDD
nums = sc.parallelize([1, 2, 3, 4, 5])

# An RDD can also be read from a text file instead:
# text_rdd = sc.textFile("/path/to/file.txt")

# Basic actions on the RDD. Only the value of the last expression in the
# cell is displayed by the notebook; the first two results are discarded.
nums.collect()  # => [1, 2, 3, 4, 5]
nums.count()    # => 5
nums.take(3)    # => [1, 2, 3]


[1, 2, 3]

In [2]:
from pyspark.sql import SparkSession

# Reuse the notebook's SparkSession. getOrCreate() returns the session that
# already exists once an earlier cell has built one, so repeating
# appName()/master() here would not rename or reconfigure the running
# application -- those misleading settings are therefore omitted.
spark = SparkSession.builder.getOrCreate()

# Get the SparkContext
sc = spark.sparkContext

# Create an RDD from a list
nums = sc.parallelize([1, 2, 3, 4])

# Transformations are lazy: nothing is computed until an action runs
squares = nums.map(lambda x: x * x)                  # 1, 4, 9, 16
even_squares = squares.filter(lambda x: x % 2 == 0)  # 4, 16

# collect() is the action that triggers the computation and returns a list
result = even_squares.collect()
print(result)  # Output: [4, 16]


[4, 16]


In [3]:
# One RDD element per sentence
lines = sc.parallelize([
    "spark makes big data simple",
    "rdds are resilient distributed datasets",
    "spark runs fast"
])

# Word count, one named step per transformation:
# 1) split every sentence on whitespace into individual words
words = lines.flatMap(lambda sentence: sentence.split())
# 2) pair each lower-cased word with 1: [("spark", 1), ("makes", 1), ("big", 1), ...]
pairs = words.map(lambda word: (word.lower(), 1))
# 3) sum the 1s per word: [("spark", 2), ("makes", 1), ("big", 1), ...]
word_counts = pairs.reduceByKey(lambda left, right: left + right)

# Show up to 10 (word, count) pairs as a sample; the pairs are not sorted
word_counts.take(10)

[('big', 1),
 ('are', 1),
 ('resilient', 1),
 ('distributed', 1),
 ('datasets', 1),
 ('runs', 1),
 ('fast', 1),
 ('spark', 2),
 ('makes', 1),
 ('data', 1)]