In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# `spark` is predefined on Databricks but not in a plain Jupyter/PySpark kernel.
# getOrCreate() returns the already-active session when there is one and builds
# a new one otherwise, so this cell also works after "Restart & Run All".
spark = SparkSession.builder.getOrCreate()

# Small in-memory sample: one (Name, Age) tuple per person.
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["Name", "Age"]
# Only column names are supplied; column types are inferred from the tuples.
df = spark.createDataFrame(data, columns)

Using map():
+---------+---------+
|UpperName|DoubleAge|
+---------+---------+
|    ALICE|       50|
|      BOB|       60|
|  CHARLIE|       70|
+---------+---------+

Using mapPartitions():
+---------+---------+
|UpperName|DoubleAge|
+---------+---------+
|    ALICE|       50|
|      BOB|       60|
|  CHARLIE|       70|
+---------+---------+

Using mapPartitionsWithIndex():
+--------------+---------+---------+
|PartitionIndex|UpperName|DoubleAge|
+--------------+---------+---------+
|             2|    ALICE|       50|
|             5|      BOB|       60|
|             7|  CHARLIE|       70|
+--------------+---------+---------+



In [0]:
def map_function(row):
    """Return ``(upper-cased name, doubled age)`` for a two-field row.

    Args:
        row: A two-element sequence ``(name, age)``; it is unpacked, so any
            other length raises ``ValueError``.

    Returns:
        A tuple of the name in upper case and the age multiplied by two.
    """
    person_name, years = row
    upper_name = person_name.upper()
    doubled_age = years * 2
    return (upper_name, doubled_age)

# map() calls map_function once per row of the underlying RDD; toDF() then
# names the two resulting columns.
mapped_df = (
    df.rdd
    .map(map_function)
    .toDF(["UpperName", "DoubleAge"])
)
print("Using map():")
mapped_df.show()






In [0]:
def map_partitions_function(iter):
    """Lazily transform every row of one partition.

    Written for ``RDD.mapPartitions()``: it receives an iterator over the
    rows of a partition and yields one output tuple per input row.

    Args:
        iter: Iterable of rows, indexable as ``row[0]`` (name) and
            ``row[1]`` (age).

    Yields:
        ``(upper-cased name, doubled age)`` for each row, in input order.
    """
    for record in iter:
        upper_name = record[0].upper()
        doubled_age = record[1] * 2
        yield (upper_name, doubled_age)

# mapPartitions() hands each partition's row iterator to the generator above;
# the yielded tuples are turned back into a two-column DataFrame.
mapped_partitions_df = (
    df.rdd
    .mapPartitions(map_partitions_function)
    .toDF(["UpperName", "DoubleAge"])
)
# Display the transformed rows.
print("Using mapPartitions():")
mapped_partitions_df.show()

In [0]:
def map_partitions_with_index_function(index, iter):
    """Lazily transform one partition's rows, tagging each with ``index``.

    Written for ``RDD.mapPartitionsWithIndex()``.

    Args:
        index: Value placed first in every yielded tuple (the partition
            index when called by Spark).
        iter: Iterable of rows, indexable as ``row[0]`` (name) and
            ``row[1]`` (age).

    Yields:
        ``(index, upper-cased name, doubled age)`` for each row, in input
        order.
    """
    for record in iter:
        upper_name = record[0].upper()
        doubled_age = record[1] * 2
        yield (index, upper_name, doubled_age)

# mapPartitionsWithIndex() passes (partition_index, row_iterator) to the
# function, so every output row records which partition it came from.
mapped_partitions_with_index_df = (
    df.rdd
    .mapPartitionsWithIndex(map_partitions_with_index_function)
    .toDF(["PartitionIndex", "UpperName", "DoubleAge"])
)

# Print the header shown in this cell's recorded output, as the map() and
# mapPartitions() cells do for theirs.
print("Using mapPartitionsWithIndex():")
mapped_partitions_with_index_df.show()



In [0]:
# map() is one-to-one: every input element produces exactly one output element.
data = list(range(1, 6))  # [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(data)
mapped_rdd = rdd.map(lambda value: value ** 2)
# collect() brings the squared values back to the driver as a Python list.
print(mapped_rdd.collect())

[1, 4, 9, 16, 25]


In [0]:
# flatMap() is one-to-many: each inner list is squared element by element and
# the per-list results are flattened into a single RDD of numbers.
data1 = [
    [1, 2],
    [3, 4, 5],
    [6],
]
rdd2 = spark.sparkContext.parallelize(data1)
flat_mapped_rdd = rdd2.flatMap(lambda numbers: [n ** 2 for n in numbers])
print(flat_mapped_rdd.collect())

[1, 4, 9, 16, 25, 36]


In [0]:
# Sample employee records, one tuple per person in `columns` order.
data = [
    ("James", "Smith", "M", 3000),
    ("Anna", "Rose", "F", 4100),
    ("Robert", "Williams", "M", 6200),
]
columns = ["firstname", "lastname", "gender", "salary"]

# Rebinds `df` to the employee DataFrame used by the following cells.
df = spark.createDataFrame(data=data, schema=columns)
df.show()

def reformat(partitionData, bonus_percent=10):
    """Yield ``[full_name, bonus]`` for every row of one partition.

    Written for ``RDD.mapPartitions()``, which calls it with a single
    positional argument, so ``bonus_percent`` takes its default there.
    To use another rate, bind it first, e.g.
    ``functools.partial(reformat, bonus_percent=15)``.

    Args:
        partitionData: Iterable of rows exposing ``firstname``, ``lastname``
            and ``salary`` attributes.
        bonus_percent: Bonus as a percentage of salary. The default of 10
            reproduces the previously hard-coded ``salary*10/100``.

    Yields:
        A two-element list: ``"firstname,lastname"`` and
        ``salary * bonus_percent / 100``.
    """
    for row in partitionData:
        full_name = row.firstname + "," + row.lastname
        # Multiply before dividing, matching the original expression order.
        bonus = row.salary * bonus_percent / 100
        yield [full_name, bonus]



+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|     Anna|    Rose|     F|  4100|
|   Robert|Williams|     M|  6200|
+---------+--------+------+------+



In [0]:
# Run reformat() once per partition, then label the two output columns.
df2 = (
    df.rdd
    .mapPartitions(reformat)
    .toDF(["name", "bonus"])
)
df2.show()

+---------------+-----+
|           name|bonus|
+---------------+-----+
|    James,Smith|300.0|
|      Anna,Rose|410.0|
|Robert,Williams|620.0|
+---------------+-----+



In [0]:
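# Related RDD map-style APIs:
# RDD.map()                            - apply a function to each element (one output per input)
# RDD.flatMap()                        - apply a function to each element, then flatten the results
# RDD.mapPartitions()                  - apply a function to each partition's iterator
# RDD.mapPartitionsWithIndex()         - like mapPartitions(), but the function also receives the partition index
# RDD.mapPartitionsWithSplit()         - deprecated; use RDD.mapPartitionsWithIndex() instead
# RDDBarrier.mapPartitionsWithIndex()  - mapPartitionsWithIndex() run in a barrier stage (all tasks launched together)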