In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session that every example below runs against.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample employee records; each tuple follows the order given in `columns`.
columns = ["firstname", "lastname", "gender", "salary"]
data = [
    ('James', 'Smith', 'M', 3000),
    ('Anna', 'Rose', 'F', 4100),
    ('Robert', 'Williams', 'M', 6200),
]

# Build the DataFrame from the rows and column names, then print it.
df = spark.createDataFrame(data, columns)
df.show()

#Example 1 mapPartitions()
def reformat(partitionData):
    """Lazily yield a ``[name, bonus]`` list for each row of one partition.

    ``partitionData`` is an iterable of rows exposing ``firstname``,
    ``lastname`` and ``salary`` attributes.  ``name`` is the first and last
    name joined by a comma; ``bonus`` is ``salary * 10 / 100``.
    """
    yield from (
        [",".join((record.firstname, record.lastname)), record.salary * 10 / 100]
        for record in partitionData
    )
# Run reformat over every partition and display the result; toDF() is called
# without column names here.
example1_rdd = df.rdd.mapPartitions(reformat)
example1_rdd.toDF().show()

#Example 2 mapPartitions()
def reformat2(partitionData):
    """Return an iterator of ``[name, bonus]`` lists for one partition.

    ``partitionData`` is an iterable of rows exposing ``firstname``,
    ``lastname`` and ``salary`` attributes.  The whole partition is
    processed into a list first, and an iterator over that list is returned.
    """
    bonuses = [
        [person.firstname + "," + person.lastname, person.salary * 10 / 100]
        for person in partitionData
    ]
    return iter(bonuses)

# Run reformat2 over every partition, name the two output columns, and display.
bonus_rdd = df.rdd.mapPartitions(reformat2)
df2 = bonus_rdd.toDF(["name", "bonus"])
df2.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|     Anna|    Rose|     F|  4100|
|   Robert|Williams|     M|  6200|
+---------+--------+------+------+

+---------------+-----+
|             _1|   _2|
+---------------+-----+
|    James,Smith|300.0|
|      Anna,Rose|410.0|
|Robert,Williams|620.0|
+---------------+-----+

+---------------+-----+
|           name|bonus|
+---------------+-----+
|    James,Smith|300.0|
|      Anna,Rose|410.0|
|Robert,Williams|620.0|
+---------------+-----+



In [0]:
#Importing Libraries:

#The SparkSession class is imported from the pyspark.sql module; it is needed to create a Spark session and work with DataFrames.
#Creating SparkSession:

#spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() creates a SparkSession with the application name "SparkByExamples.com".
#Creating DataFrame:

#The code defines a list of tuples data and a list of column names columns to represent the data and schema, respectively.
#df = spark.createDataFrame(data=data, schema=columns) creates a DataFrame df using the provided data and schema.
#Displaying DataFrame:

#df.show() is used to display the data in the DataFrame df.
#Example 1: mapPartitions() with yield:

#The code defines a function reformat() that takes a partition of data as input.
#The function iterates over each row in the partition, concatenates the "firstname" and "lastname" columns with a comma, and calculates the bonus as 10% of the "salary" column (salary*10/100).
#The yield keyword is used to emit each modified row as a list.
#df.rdd.mapPartitions(reformat).toDF().show() applies the mapPartitions() transformation on the RDD of df using the reformat() function, converts the resulting RDD back to a DataFrame using toDF(), and displays the output.
#Example 2: mapPartitions() with list accumulation:

#The code defines a function reformat2() that takes a partition of data as input.
#The function initializes an empty list, updatedData.
#The function iterates over each row in the partition, concatenates the "firstname" and "lastname" columns with a comma, and calculates the bonus as 10% of the "salary" column (salary*10/100).
#The concatenated name and bonus amount are appended as a list to updatedData.
#The updated list updatedData is returned as an iterator using iter().
#df2 = df.rdd.mapPartitions(reformat2).toDF(["name", "bonus"]) applies the mapPartitions() transformation on the RDD of df using the reformat2() function and converts the resulting RDD back to a DataFrame with the specified column names (passed to toDF() as a list); df2.show() then displays the output.
#Both examples illustrate how to use mapPartitions() to apply custom transformations on each partition of an RDD in PySpark.