In [0]:
# Location and format of the sample input file.
in_path = "dbfs:/FileStore/tables/mart_in_json.json"
file_format = "json"

# Read the file into df1; the format and the multiLine reader option are
# handed to load() as keywords instead of chained format()/option() calls.
df1 = spark.read.load(in_path, format=file_format, multiLine=True)

# Preview four of the columns, then print the full inferred schema.
df1.select(["Area", "fruits", "vegetable", "Stationary"]).show()
df1.printSchema()

+------+--------------+--------------+-------------+
|  Area|        fruits|     vegetable|   Stationary|
+------+--------------+--------------+-------------+
|CHN004|{apple, 10.25}|{tomato, 5.25}|{Pencil, 3.1}|
|DLN004|{apple, 12.15}|{tomato, 9.25}|{Pencil, 4.1}|
|MUM011|  {apple, 9.5}| {tomato, 5.1}|{Pencil, 3.7}|
+------+--------------+--------------+-------------+

root
 |-- Appliance: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Area: string (nullable = true)
 |-- Month: long (nullable = true)
 |-- Stationary: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vegetable: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- fruits: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- year: long (nullable = true)

In [0]:
# Render every column of df1 as a table; struct columns appear as
# List(item, rate) in the output.
df1.display()

Appliance,Area,Month,Stationary,Type,Vegetable,fruits,year
"List(Mixer, 113.2)",CHN004,5,"List(Pencil, 3.1)",Retail Super Market,"List(tomato, 5.25)","List(apple, 10.25)",2022
"List(Mixer, 103.9)",DLN004,5,"List(Pencil, 4.1)",Retail Super Market,"List(tomato, 9.25)","List(apple, 12.15)",2022
"List(Mixer, 117.7)",MUM011,5,"List(Pencil, 3.7)",Retail Super Market,"List(tomato, 5.1)","List(apple, 9.5)",2022


## Add a column to a StructType field

In [0]:
# 'fruits.*' expands the fruits struct into one top-level column per field
# (item and rate), which makes the struct's current contents easy to inspect.
df1.select('fruits.*').display()

item,rate
apple,10.25
apple,12.15
apple,9.5


In [0]:
# lit() wraps a Python constant as a Column.
from pyspark.sql.functions import lit
# Counter-example: passing a dotted name to withColumn() does NOT add a field
# to the fruits struct. The printed schema shows a new top-level column that is
# literally named 'fruits.discount', while fruits still has only item and rate.
df1.withColumn('fruits.discount',lit(10)).printSchema()

root
 |-- Appliance: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Area: string (nullable = true)
 |-- Month: long (nullable = true)
 |-- Stationary: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vegetable: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- fruits: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- year: long (nullable = true)
 |-- fruits.discount: integer (nullable = false)



### Solution with struct()

In [0]:
# Add a `discount` field to the `fruits` struct by rebuilding the struct.
# A dotted name in withColumn() only creates a top-level column, so the struct
# column is replaced by a new struct that lists every field explicitly.

# Import everything this cell uses, including lit, so the cell does not rely
# on an earlier demo cell having been executed first.
from pyspark.sql.functions import col, lit, struct

# Names of the fields currently inside `fruits`. fieldNames() returns a new
# list, whereas the `.names` attribute is the schema object's own list.
s_fields = df1.schema["fruits"].dataType.fieldNames()

# New struct = every existing field (aliased back to its own name) followed by
# the constant `discount` field.
df2 = df1.withColumn(
    "fruits",
    struct(
        *[col("fruits")[c].alias(c) for c in s_fields],
        lit(10).alias("discount")
    ),
)

# Print the schema: `fruits` should now contain item, rate and discount.
df2.printSchema()

root
 |-- Appliance: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Area: string (nullable = true)
 |-- Month: long (nullable = true)
 |-- Stationary: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vegetable: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- fruits: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |    |-- discount: integer (nullable = false)
 |-- year: long (nullable = true)



In [0]:
# Show the field names that were read from the fruits struct.
s_fields

Out[12]: ['item', 'rate']

In [0]:
# Preview the result: each fruits value now carries a third element, the
# discount of 10. The lowercase "vegetable" still resolves to the Vegetable
# column, as the output shows.
df2.select("Area", "fruits", "vegetable","Stationary").display()

Area,fruits,vegetable,Stationary
CHN004,"List(apple, 10.25, 10)","List(tomato, 5.25)","List(Pencil, 3.1)"
DLN004,"List(apple, 12.15, 10)","List(tomato, 9.25)","List(Pencil, 4.1)"
MUM011,"List(apple, 9.5, 10)","List(tomato, 5.1)","List(Pencil, 3.7)"


## Remove a Column from StructType()

In [0]:
# Counter-example: drop() with a dotted path does not remove a nested field.
# The call raises no error, but the printed schema still lists Vegetable.rate.
df3 = df2.drop('Vegetable.rate')
df3.printSchema()

root
 |-- Appliance: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Area: string (nullable = true)
 |-- Month: long (nullable = true)
 |-- Stationary: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vegetable: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- fruits: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |    |-- discount: integer (nullable = false)
 |-- year: long (nullable = true)



**`drop()` cannot remove a field nested inside a StructType column: the call above leaves the schema unchanged. Instead, take the struct's field names, leave out the unwanted field, and use `withColumn` to rebuild the struct from the remaining fields.**

In [0]:
# Remove the `rate` field from the `Vegetable` struct by rebuilding the struct
# from the fields that should remain.

# Build a NEW list of the fields to keep. Do not call .remove() on
# `dataType.names`: that attribute is the StructType's own list, so removing
# from it edits the DataFrame's schema object in place and can make this cell
# fail with ValueError when it is run a second time.
# The schema is read from df2 because df2 is the frame being transformed.
s_fields = [
    name
    for name in df2.schema["Vegetable"].dataType.fieldNames()
    if name != "rate"
]

# Replace `Vegetable` with a struct that contains only the kept fields.
df3 = df2.withColumn(
    "Vegetable",
    struct(*[col("Vegetable")[c].alias(c) for c in s_fields]),
)

# Print the schema: `Vegetable` should now contain only item.
df3.printSchema()

root
 |-- Appliance: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Area: string (nullable = true)
 |-- Month: long (nullable = true)
 |-- Stationary: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vegetable: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |-- fruits: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |    |-- discount: integer (nullable = false)
 |-- year: long (nullable = true)



In [0]:
# Preview the result: vegetable now holds only the item value, while fruits
# keeps the discount added earlier.
df3.select("Area","fruits","vegetable","Stationary").display()

Area,fruits,vegetable,Stationary
CHN004,"List(apple, 10.25, 10)",List(tomato),"List(Pencil, 3.1)"
DLN004,"List(apple, 12.15, 10)",List(tomato),"List(Pencil, 4.1)"
MUM011,"List(apple, 9.5, 10)",List(tomato),"List(Pencil, 3.7)"


## Cast a Column inside StructType()

In [0]:
# Cast the `rate` field inside the `Stationary` struct to string by rebuilding
# the struct: untouched fields are copied as-is and `rate` is re-added cast.

# Fields that are copied unchanged (everything except `rate`). A filtered copy
# is built instead of calling .remove() on `dataType.names`, because that
# attribute is the StructType's own list and removing from it would edit the
# DataFrame's schema object in place (and can raise ValueError on a re-run).
# The schema is read from df3 because df3 is the frame being transformed.
s_fields = [
    name
    for name in df3.schema["Stationary"].dataType.fieldNames()
    if name != "rate"
]

# The cast field is appended after the copied ones; the original field order
# is preserved here only because `rate` is already the last field of the struct.
df4 = df3.withColumn(
    "Stationary",
    struct(
        *[col("Stationary")[c].alias(c) for c in s_fields],
        col("Stationary")["rate"].cast("string").alias("rate")
    ),
)

# Print the schema: Stationary.rate should now be a string.
df4.printSchema()

root
 |-- Appliance: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- Area: string (nullable = true)
 |-- Month: long (nullable = true)
 |-- Stationary: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |    |-- rate: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vegetable: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |-- fruits: struct (nullable = false)
 |    |-- item: string (nullable = true)
 |    |-- rate: double (nullable = true)
 |    |-- discount: integer (nullable = false)
 |-- year: long (nullable = true)



In [0]:
# Preview the result: the displayed values are unchanged, because the cast
# only alters the type of Stationary.rate (visible in the schema), not its text.
df4.select("Area","fruits","vegetable","Stationary").display()

Area,fruits,vegetable,Stationary
CHN004,"List(apple, 10.25, 10)",List(tomato),"List(Pencil, 3.1)"
DLN004,"List(apple, 12.15, 10)",List(tomato),"List(Pencil, 4.1)"
MUM011,"List(apple, 9.5, 10)",List(tomato),"List(Pencil, 3.7)"
