In [0]:
# Read the pipe-delimited population file, taking column names from the
# header row and letting Spark infer each column's type.
df = spark.read.format("csv").options(
    header="true",
    inferSchema="true",
    delimiter="|",
).load("/FileStore/tables/uspopulation.csv")
display(df)

2019_rank,City,State_Code,2019_estimate,2010_Census,Change
1,New York[d],NY,8336817,8175133,0.0198
2,Los Angeles,CA,3979576,3792621,0.0493
3,Chicago,IL,2693976,2695598,−0.06%
4,Houston[3],TX,2320268,2100263,0.1048
5,Phoenix,AZ,1680992,1445632,0.1628
6,San Antonio,TX,1547253,1327407,0.1656
7,San Diego,CA,1423851,1307402,0.0891
8,Dallas,TX,1343573,1197816,0.1217
9,San Jose,CA,1021795,945942,0.0802
10,Austin,TX,978908,790390,0.2385


User Defined Functions

In [0]:
# Two-letter state code -> full state name, used by the mapping UDFs.
lookup = {
    "NY": "New York",
    "CA": "California",
    "IL": "Illinois",
    "TX": "Texas",
    "AZ": "Arizona",
    "CO": "Colorado",
    "OH": "Ohio",
}
@udf
def map_state_code(state_code):
    """Translate a state code into its full name via ``lookup``.

    Returns the mapped name when ``state_code`` is a key of ``lookup``;
    otherwise the input value is returned unchanged.
    """
    return lookup.get(state_code, state_code)

In [0]:
# Append the full state name produced by the plain-Python-dict UDF.
df = df.withColumn(
    "State_Udf",
    map_state_code("State_Code"),
)
display(df)

2019_rank,City,State_Code,2019_estimate,2010_Census,Change,State_Udf
1,New York[d],NY,8336817,8175133,0.0198,New York
2,Los Angeles,CA,3979576,3792621,0.0493,California
3,Chicago,IL,2693976,2695598,−0.06%,Illinois
4,Houston[3],TX,2320268,2100263,0.1048,Texas
5,Phoenix,AZ,1680992,1445632,0.1628,Arizona
6,San Antonio,TX,1547253,1327407,0.1656,Texas
7,San Diego,CA,1423851,1307402,0.0891,California
8,Dallas,TX,1343573,1197816,0.1217,Texas
9,San Jose,CA,1021795,945942,0.0802,California
10,Austin,TX,978908,790390,0.2385,Texas


Broadcast Variable

In [0]:
# Wrap the lookup dict in a Spark broadcast variable; UDFs below read it through `.value`.
broadcastVariable=sc.broadcast(lookup)

In [0]:
# Inspect the wrapper type returned by sc.broadcast().
type(broadcastVariable)

Out[42]: pyspark.broadcast.Broadcast

In [0]:
@udf
def map_state_code_bv(State_code):
    """Translate a state code into its full name using the broadcast lookup.

    Reads the dictionary held by ``broadcastVariable`` and returns the mapped
    state name. When ``State_code`` is not a key of that dictionary (including
    a null value), the input is returned unchanged, matching the fallback
    behaviour of ``map_state_code``.
    """
    # Plain indexing would raise KeyError for any unmapped code and fail the
    # whole Spark task; .get() keeps the row and passes the raw code through.
    return broadcastVariable.value.get(State_code, State_code)

In [0]:
# Append the full state name produced by the broadcast-variable UDF.
df = df.withColumn(
    "State_Bvar",
    map_state_code_bv("State_Code"),
)
display(df)

2019_rank,City,State_Code,2019_estimate,2010_Census,Change,State_Udf,State_Bvar
1,New York[d],NY,8336817,8175133,0.0198,New York,New York
2,Los Angeles,CA,3979576,3792621,0.0493,California,California
3,Chicago,IL,2693976,2695598,−0.06%,Illinois,Illinois
4,Houston[3],TX,2320268,2100263,0.1048,Texas,Texas
5,Phoenix,AZ,1680992,1445632,0.1628,Arizona,Arizona
6,San Antonio,TX,1547253,1327407,0.1656,Texas,Texas
7,San Diego,CA,1423851,1307402,0.0891,California,California
8,Dallas,TX,1343573,1197816,0.1217,Texas,Texas
9,San Jose,CA,1021795,945942,0.0802,California,California
10,Austin,TX,978908,790390,0.2385,Texas,Texas


In [0]:
# Read the broadcast dictionary directly on the driver.
broadcastVariable.value["NY"]

Out[52]: 'New York'

In [0]:
# By default, broadcast variables are cached on the worker machines.
# You can release those cached copies by calling .unpersist().
broadcastVariable.unpersist()

If I call destroy(), every worker machine destroys its copy of the broadcast variable, and the variable can no longer be used for any task execution.

In [0]:
# Permanently destroy the broadcast variable; the next cell shows that it is unusable afterwards.
broadcastVariable.destroy()

In [0]:
# Intentional failure: the broadcast was destroyed in the previous cell, so
# reading .value raises (FileNotFoundError in the recorded run below).
broadcastVariable.value["NY"]

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
File [0;32m<command-262696914761746>:1[0m
[0;32m----> 1[0m [43mbroadcastVariable[49m[38;5;241;43m.[39;49m[43mvalue[49m[[38;5;124m"[39m[38;5;124mNy[39m[38;5;124m"[39m]

File [0;32m/databricks/spark/python/pyspark/broadcast.py:282[0m, in [0;36mBroadcast.value[0;34m(self)[0m
[1;32m    280[0m         [38;5;28;01mreturn[39;00m [38;5;28mself[39m[38;5;241m.[39mload(decrypted_sock_file)
[1;32m    281[0m     [38;5;28;01melse[39;00m:
[0;32m--> 282[0m         [38;5;28mself[39m[38;5;241m.[39m_value [38;5;241m=[39m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43mload_from_path[49m[43m([49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_path[49m[43m)[49m
[1;32m    283[0m [38;5;28;01mreturn[39;00m [38;5;28mself[39m[38;5;241m.[39m_value

File [0;32m/databricks/spar