# explode array and map columns to rows

* 有些表格的欄位是 array（list）或 map（dictionary）型別；這份筆記示範如何用 explode 系列函式把這類欄位攤平成多列。

In [2]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Reuse the active Spark session if one exists; otherwise create it.
spark = (
    SparkSession.builder
    .appName('pyspark-by-examples')
    .getOrCreate()
)

# Sample rows of (name, list of languages, dict of properties).
# The rows deliberately cover the edge cases the explode functions treat
# differently: a None element, an empty string, a None list/dict, and an
# empty dict.
arrayData = [
    ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
    ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
    ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('Washington', None, None),
    ('Jefferson', ['1', '2'], {}),
]

# Only column names are supplied; the column types are inferred from the data.
df = spark.createDataFrame(
    data=arrayData,
    schema=['name', 'knownLanguages', 'properties'],
)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-------------------+-----------------------------+
|name      |knownLanguages     |properties                   |
+----------+-------------------+-----------------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|
|Michael   |[Spark, Java, null]|{eye -> null, hair -> brown} |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |
|Washington|null               |null                         |
|Jefferson |[1, 2]             |{}                           |
+----------+-------------------+-----------------------------+



## 用 explode() 把 array column 給攤平

In [4]:
# One output row per array element; the generated column is named `col`.
# Null elements and empty strings inside an array are kept, but a row whose
# array itself is null (Washington) yields no output rows.
df2 = df.select("name", F.explode("knownLanguages"))
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+---------+------+
|     name|   col|
+---------+------+
|    James|  Java|
|    James| Scala|
|  Michael| Spark|
|  Michael|  Java|
|  Michael|  null|
|   Robert|CSharp|
|   Robert|      |
|Jefferson|     1|
|Jefferson|     2|
+---------+------+



## 用 explode() 把 dictionary column 給攤平成 key, value

In [6]:
# One output row per map entry, split into `key` and `value` columns.
# Rows whose map is null (Washington) or empty (Jefferson) yield no output rows.
df3 = df.select("name", F.explode("properties"))
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

+-------+----+-----+
|   name| key|value|
+-------+----+-----+
|  James| eye|brown|
|  James|hair|black|
|Michael| eye| null|
|Michael|hair|brown|
| Robert| eye|     |
| Robert|hair|  red|
+-------+----+-----+



## 用 explode_outer() 把 array / map 為 null 或空的列也保留下來

In [7]:
# Same as explode(), except a row whose array is null is kept as a single
# row with `col` = null (see Washington in the output).
df.select("name", F.explode_outer("knownLanguages")).show()

+----------+------+
|      name|   col|
+----------+------+
|     James|  Java|
|     James| Scala|
|   Michael| Spark|
|   Michael|  Java|
|   Michael|  null|
|    Robert|CSharp|
|    Robert|      |
|Washington|  null|
| Jefferson|     1|
| Jefferson|     2|
+----------+------+



In [9]:
# Rows whose map is null (Washington) or empty (Jefferson) are kept as a
# single row with null `key` and `value`.
df.select("name", F.explode_outer("properties")).show()

+----------+----+-----+
|      name| key|value|
+----------+----+-----+
|     James| eye|brown|
|     James|hair|black|
|   Michael| eye| null|
|   Michael|hair|brown|
|    Robert| eye|     |
|    Robert|hair|  red|
|Washington|null| null|
| Jefferson|null| null|
+----------+----+-----+



## 用 posexplode() 把原本的 position 也存出來

In [10]:
# Like explode(), plus a `pos` column holding each element's zero-based
# index within its original array.
df.select("name", F.posexplode("knownLanguages")).show()

+---------+---+------+
|     name|pos|   col|
+---------+---+------+
|    James|  0|  Java|
|    James|  1| Scala|
|  Michael|  0| Spark|
|  Michael|  1|  Java|
|  Michael|  2|  null|
|   Robert|  0|CSharp|
|   Robert|  1|      |
|Jefferson|  0|     1|
|Jefferson|  1|     2|
+---------+---+------+



In [11]:
# For a map column the result has three generated columns: `pos`, `key`, `value`.
df.select("name", F.posexplode("properties")).show()

+-------+---+----+-----+
|   name|pos| key|value|
+-------+---+----+-----+
|  James|  0| eye|brown|
|  James|  1|hair|black|
|Michael|  0| eye| null|
|Michael|  1|hair|brown|
| Robert|  0| eye|     |
| Robert|  1|hair|  red|
+-------+---+----+-----+



## 用 posexplode_outer() 把 array / map 為 null 或空的列也保留下來

* 原本的資料中，第四列（Washington）的 `knownLanguages` 是 null，所以用 `posexplode()` 時沒有任何元素可以展開、也沒有 position 可以記錄，這一列就會被濾掉：

In [14]:
# Re-display the source rows in full (no truncation) for reference.
df.show(truncate=False)

+----------+-------------------+-----------------------------+
|name      |knownLanguages     |properties                   |
+----------+-------------------+-----------------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|
|Michael   |[Spark, Java, null]|{eye -> null, hair -> brown} |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |
|Washington|null               |null                         |
|Jefferson |[1, 2]             |{}                           |
+----------+-------------------+-----------------------------+



* 那現在改用 `posexplode_outer()`，就可以保留這列：

In [13]:
# The row with a null array (Washington) is kept, with null `pos` and `col`.
df.select(df.name, F.posexplode_outer(df.knownLanguages)).show()

+----------+----+------+
|      name| pos|   col|
+----------+----+------+
|     James|   0|  Java|
|     James|   1| Scala|
|   Michael|   0| Spark|
|   Michael|   1|  Java|
|   Michael|   2|  null|
|    Robert|   0|CSharp|
|    Robert|   1|      |
|Washington|null|  null|
| Jefferson|   0|     1|
| Jefferson|   1|     2|
+----------+----+------+



* 同理，用在 dictionary 欄位也一樣：

In [16]:
# Rows with a null or empty map are kept, with null `pos`, `key` and `value`.
df.select("name", F.posexplode_outer("properties")).show()

+----------+----+----+-----+
|      name| pos| key|value|
+----------+----+----+-----+
|     James|   0| eye|brown|
|     James|   1|hair|black|
|   Michael|   0| eye| null|
|   Michael|   1|hair|brown|
|    Robert|   0| eye|     |
|    Robert|   1|hair|  red|
|Washington|null|null| null|
| Jefferson|null|null| null|
+----------+----+----+-----+

