In [0]:
import pyspark
from pyspark.sql import SparkSession

# Lookup table: two-letter state code -> full state name.
states = {
    "NY": "New York",
    "CA": "California",
    "FL": "Florida",
}

# Sample records; the last field of each tuple is a state code found in `states`.
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Obtain the Spark session for this notebook (an existing one is reused).
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Wrap the lookup table in a broadcast variable; it is read back via `.value`.
broadcastStates = spark.sparkContext.broadcast(states)

# Turn the sample records into an RDD.
rdd = spark.sparkContext.parallelize(data)

def state_convert(code):
    """Return the full state name stored for ``code``.

    The name is looked up in the dictionary held by the ``broadcastStates``
    broadcast variable. A code that is not a key of that dictionary raises
    ``KeyError``.
    """
    state_names = broadcastStates.value
    return state_names[code]

# Rebuild every record with its state code (index 3) swapped for the full
# state name, then gather the transformed records into a local list.
result = (
    rdd
    .map(lambda rec: (rec[0], rec[1], rec[2], state_convert(rec[3])))
    .collect()
)
print(result)


[('James', 'Smith', 'USA', 'California'), ('Michael', 'Rose', 'USA', 'New York'), ('Robert', 'Williams', 'USA', 'California'), ('Maria', 'Jones', 'USA', 'Florida')]


In [0]:
#Importing Libraries: The code imports the necessary libraries: the pyspark package, and the SparkSession class from pyspark.sql.

#Creating SparkSession: A Spark session is created using SparkSession.builder.appName('SparkByExamples.com').getOrCreate().

#Creating a Dictionary: A dictionary named states is created, which maps state codes to their corresponding names.

#Broadcasting States: The dictionary states is broadcast using spark.sparkContext.broadcast(states). Spark then sends a read-only copy of the dictionary to each worker node once and caches it there, instead of shipping it along with every task.

#Creating RDD: An RDD named rdd is created from a list of tuples data using spark.sparkContext.parallelize(data).

#Defining state_convert Function: A function named state_convert is defined, which takes a state code as input and returns the corresponding state name by looking it up in the broadcast dictionary through broadcastStates.value.

#Mapping and Collecting: The RDD rdd is mapped using a lambda function that applies the state_convert function to convert the state code to its name. The result is collected using the collect action and stored in the result variable.

#Printing the Result: The result list is printed. Each tuple holds the original first three fields, with the state code replaced by the full state name.

#The code demonstrates how broadcasting can be used to efficiently distribute and utilize a shared read-only variable (states dictionary) across all the worker nodes in a Spark cluster.