In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, flatten


# Get the active SparkSession, or start one registered under this app name.
spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()

# Sample rows of (name, subjects); subjects is a list of lists of strings,
# which Spark maps to an array<array<string>> column.
arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]

# Only column names are supplied; the column types are inferred from the data.
df = spark.createDataFrame(data=arrayArrayData, schema=['name', 'subjects'])
df.printSchema()
df.show(truncate=False)

# explode() emits one output row per element of the outer array, so every
# inner list lands on its own row next to the repeated name. The generated
# column gets the default name `col` because no alias is given.
df.select(df.name, explode(df.subjects)).show(truncate=False)

# flatten() creates a single array from an array of arrays, keeping one row
# per name.
df.select(df.name, flatten(df.subjects)).show(truncate=False)

# End-of-cell marker kept from the original: as the cell's final expression
# it may be echoed as the cell result, so it is left in place.
"""END"""

root
 |-- name: string (nullable = true)
 |-- subjects: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+

+-------+-------------------------------+
|name   |flatten(subjects)              |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Ja

In [0]:
# It imports the pyspark package and the SparkSession class from pyspark.sql.
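# It obtains a SparkSession object named spark via SparkSession.builder.appName('pyspark-by-examples').getOrCreate(), which returns the already-active session if one exists and otherwise creates a new one with that application name.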
# It defines a list named arrayArrayData whose items are tuples of a name and a nested list (a list of lists) of subjects.
# It creates a DataFrame df by calling spark.createDataFrame() and passing arrayArrayData and the schema as arguments. The schema lists only two column names, 'name' and 'subjects', so the column types are inferred from the data. df.printSchema() then prints the inferred schema, and df.show(truncate=False) prints the rows without truncating long values.
#It imports the explode and flatten functions from pyspark.sql.functions.
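# It applies the explode function to the 'subjects' column of df by calling df.select() and passing df.name and explode(df.subjects) as arguments. The resulting DataFrame is displayed using df.select(df.name,explode(df.subjects)).show(truncate=False). This function creates a new row for each element of the array in the 'subjects' column, along with the corresponding 'name' value. Because 'subjects' is an array of arrays, each element is itself an inner array (for example, James yields one row for [Java, Scala, C++] and one for [Spark, Java]), and the new column gets the default name 'col'.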
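# It applies the flatten function to the 'subjects' column of df by calling df.select() and passing df.name and flatten(df.subjects) as arguments. The resulting DataFrame is displayed using df.select(df.name,flatten(df.subjects)).show(truncate=False). For each row, this function merges the inner arrays of that row's 'subjects' value into a single array (for example, James becomes [Java, Scala, C++, Spark, Java]); the number of rows is unchanged, and only one level of nesting is removed.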