## Create dataframe with array column

In [0]:
# Sample data: one (name, appliances) tuple per person. Two of the lists end
# with a None element, so the resulting array column contains null entries.
array = [
    ("Raja", ["TV", "Refrigerator", "Oven", "AC"]),
    ("Aditi", ["Washing Machine", "Microwave", None]),
    ("Vikram", ["TV", "Laptop"]),
    ("Sita", ["Refrigerator", "TV", "Blender", "Fan"]),
    ("John", ["Oven", "AC", None]),
]

# Only column names are supplied here; the column types are inferred from the
# data (Appliances comes out as an array of strings in the printed schema).
df_app = spark.createDataFrame(array, ["Name", "Appliances"])

df_app.printSchema()
display(df_app)

root
 |-- Name: string (nullable = true)
 |-- Appliances: array (nullable = true)
 |    |-- element: string (containsNull = true)



Name,Appliances
Raja,"List(TV, Refrigerator, Oven, AC)"
Aditi,"List(Washing Machine, Microwave, null)"
Vikram,"List(TV, Laptop)"
Sita,"List(Refrigerator, TV, Blender, Fan)"
John,"List(Oven, AC, null)"


## Create dataframe with map column

In [0]:
# Sample data: each person is paired with an appliance -> brand dict.
# John has no dict at all (None), which gives a null Brand value for that row.
map_brand = [
    ("Raja", {"TV": "SamSung", "Refrigerator": "LG", "Oven": "LG", "AC": "Dolphin"}),
    ("Aditi", {"Washing Machine": "Bosch", "Microwave": "LG"}),
    ("Vikram", {"TV": "Intel", "Laptop": "Lenovo"}),
    ("Sita", {"Refrigerator": "Daikin", "TV": "Sony"}),
    ("John", None),
]

# Only column names are supplied; the Python dicts are inferred as a map
# column with string keys and string values (see the printed schema).
df_brand = spark.createDataFrame(map_brand, ["Name", "Brand"])

df_brand.printSchema()
display(df_brand)

root
 |-- Name: string (nullable = true)
 |-- Brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



Name,Brand
Raja,"Map(Refrigerator -> LG, AC -> Dolphin, TV -> SamSung, Oven -> LG)"
Aditi,"Map(Microwave -> LG, Washing Machine -> Bosch)"
Vikram,"Map(Laptop -> Lenovo, TV -> Intel)"
Sita,"Map(Refrigerator -> Daikin, TV -> Sony)"
John,


### Explode with array

In [0]:
from pyspark.sql.functions import explode

# explode() produces one output row per element of the Appliances array;
# Name is repeated on each of those rows. The generated column is not
# aliased, so it gets the default name `col`.
df2 = df_app.select(
    df_app["Name"],
    explode(df_app["Appliances"]),
)



In [0]:
# Render the exploded array frame: one row per (Name, array element) pair.
# Null elements inside an array still appear as a row with an empty value.
display(df2)

Name,col
Raja,TV
Raja,Refrigerator
Raja,Oven
Raja,AC
Aditi,Washing Machine
Aditi,Microwave
Aditi,
Vikram,TV
Vikram,Laptop
Sita,Refrigerator


`explode` nhận một cột kiểu array và tạo ra một dòng mới cho mỗi phần tử trong array: các cột còn lại (ở đây là `Name`) được lặp lại trên từng dòng, còn từng phần tử được đưa vào một cột mới có tên mặc định là `col`. Phần tử `null` nằm bên trong array vẫn được giữ lại thành một dòng riêng (ví dụ dòng thứ ba của Aditi).

### Explode with map

In [0]:
# Exploding a map column yields two generated columns, `key` and `value`,
# with one output row per map entry.
df3 = df_brand.select(
    df_brand["Name"],
    explode(df_brand["Brand"]),
)

df3.printSchema()
display(df3)

root
 |-- Name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



Name,key,value
Raja,Refrigerator,LG
Raja,AC,Dolphin
Raja,TV,SamSung
Raja,Oven,LG
Aditi,Microwave,LG
Aditi,Washing Machine,Bosch
Vikram,Laptop,Lenovo
Vikram,TV,Intel
Sita,Refrigerator,Daikin
Sita,TV,Sony


Khi dùng `explode` với cột kiểu map, kết quả gồm 3 cột: cột `Name` ban đầu, cột `key` và cột `value`; mỗi cặp key–value trong map trở thành một dòng riêng.

## Explode_outer to consider NULL values

In [0]:
from pyspark.sql.functions import explode_outer

# Per the PySpark documentation, explode_outer() behaves like explode() except
# that a null or empty array/map still produces one output row (filled with
# nulls) instead of producing no row for that input record.

# Array column: one row per element, generated column `col`.
display(
    df_app.select(df_app["Name"], explode_outer(df_app["Appliances"]))
)

# Map column: one row per entry, generated columns `key` and `value`.
# John's Brand is None, so this is the record affected by the outer variant.
display(
    df_brand.select(df_brand["Name"], explode_outer(df_brand["Brand"]))
)

Name,col
Raja,TV
Raja,Refrigerator
Raja,Oven
Raja,AC
Aditi,Washing Machine
Aditi,Microwave
Aditi,
Vikram,TV
Vikram,Laptop
Sita,Refrigerator


Name,key,value
Raja,Refrigerator,LG
Raja,AC,Dolphin
Raja,TV,SamSung
Raja,Oven,LG
Aditi,Microwave,LG
Aditi,Washing Machine,Bosch
Vikram,Laptop,Lenovo
Vikram,TV,Intel
Sita,Refrigerator,Daikin
Sita,TV,Sony


## Positional Explode

In [0]:
from pyspark.sql.functions import posexplode

# posexplode() is explode() plus a leading `pos` column holding the 0-based
# position of each element (array) or entry (map) within its collection.

# Array column -> columns: Name, pos, col.
display(
    df_app.select(df_app["Name"], posexplode(df_app["Appliances"]))
)

# Map column -> columns: Name, pos, key, value.
display(
    df_brand.select(df_brand["Name"], posexplode(df_brand["Brand"]))
)

Name,pos,col
Raja,0,TV
Raja,1,Refrigerator
Raja,2,Oven
Raja,3,AC
Aditi,0,Washing Machine
Aditi,1,Microwave
Aditi,2,
Vikram,0,TV
Vikram,1,Laptop
Sita,0,Refrigerator


Name,pos,key,value
Raja,0,Refrigerator,LG
Raja,1,AC,Dolphin
Raja,2,TV,SamSung
Raja,3,Oven,LG
Aditi,0,Microwave,LG
Aditi,1,Washing Machine,Bosch
Vikram,0,Laptop,Lenovo
Vikram,1,TV,Intel
Sita,0,Refrigerator,Daikin
Sita,1,TV,Sony


Khi dùng `posexplode`, kết quả có thêm cột `pos` cho biết vị trí (bắt đầu từ 0) của từng phần tử trong array, hoặc của từng cặp key–value trong map, ở DataFrame ban đầu.

In [0]:
from pyspark.sql.functions import posexplode_outer

# posexplode_outer() combines the two previous variants: it adds the `pos`
# column like posexplode() and, per the PySpark documentation, still emits a
# row of nulls when the array/map is null or empty.

# Array column -> columns: Name, pos, col.
display(
    df_app.select(df_app["Name"], posexplode_outer(df_app["Appliances"]))
)

# Map column -> columns: Name, pos, key, value.
# John's Brand is None, so this is the record affected by the outer variant.
display(
    df_brand.select(df_brand["Name"], posexplode_outer(df_brand["Brand"]))
)

Name,pos,col
Raja,0,TV
Raja,1,Refrigerator
Raja,2,Oven
Raja,3,AC
Aditi,0,Washing Machine
Aditi,1,Microwave
Aditi,2,
Vikram,0,TV
Vikram,1,Laptop
Sita,0,Refrigerator


Name,pos,key,value
Raja,0.0,Refrigerator,LG
Raja,1.0,AC,Dolphin
Raja,2.0,TV,SamSung
Raja,3.0,Oven,LG
Aditi,0.0,Microwave,LG
Aditi,1.0,Washing Machine,Bosch
Vikram,0.0,Laptop,Lenovo
Vikram,1.0,TV,Intel
Sita,0.0,Refrigerator,Daikin
Sita,1.0,TV,Sony
