# explode nested array into rows

* 先做出 Nested array 的範例資料

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('pyspark-by-examples')
    .getOrCreate()
)

# Sample rows of (name, subjects); each `subjects` value is an array whose
# elements are themselves arrays of strings.
arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]

# Only column names are given here, so Spark infers the column types
# (array<array<string>> for `subjects`) from the data itself.
df = spark.createDataFrame(arrayArrayData, ['name', 'subjects'])

# Print the inferred schema, then every row with untruncated cell contents.
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- subjects: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+



* 如果用之前學過的 `explode()` function，它只會幫你解掉一層，如下：

In [3]:
# explode() yields one output row per element of the OUTER array only, so each
# resulting row still carries an inner array. The generated column is not
# aliased here and therefore takes the default name "col".
(
    df
    .select(df["name"], F.explode(df["subjects"]))
    .show(truncate=False)
)

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+



* 用這章教的 `flatten()` function，則是幫你把 nested array，變成單一 array

In [9]:
# flatten() concatenates the inner arrays into a single array per row; the row
# count is unchanged. The result is aliased back to "subjects".
(
    df
    .select(df["name"], F.flatten(df["subjects"]).alias("subjects"))
    .show(truncate=False)
)

+-------+-------------------------------+
|name   |subjects                       |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Java]|
|Robert |[CSharp, VB, Spark, Python]    |
+-------+-------------------------------+



* 那使用 combo 技，就可以完全解開了：

In [19]:
# Two-step combo: first flatten the nested array into a single array per row,
# then explode that array so every subject ends up on its own row.
(
    df
    .select(df["name"], F.flatten(df["subjects"]).alias("subjects"))
    .select("name", F.explode(F.col("subjects")).alias("subjects"))
    .show()
)

+-------+--------+
|   name|subjects|
+-------+--------+
|  James|    Java|
|  James|   Scala|
|  James|     C++|
|  James|   Spark|
|  James|    Java|
|Michael|   Spark|
|Michael|    Java|
|Michael|     C++|
|Michael|   Spark|
|Michael|    Java|
| Robert|  CSharp|
| Robert|      VB|
| Robert|   Spark|
| Robert|  Python|
+-------+--------+

