### Import Required Libraries

In [61]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

### Create SparkSession (this also starts the underlying SparkContext)

In [62]:
# Entry point for the DataFrame API: reuse the active SparkSession if one
# already exists, otherwise start a new one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

### Sample Data

### Read Sample JSON File

In [63]:
# Load the sample JSON document; the schema is inferred from the data because
# none is supplied. multiLine lets Spark parse a record that spans several
# lines (by default it expects one complete JSON record per line).
# The option is passed as a real boolean rather than the string "true".
rawDF = spark.read.json("../data/sample.json", multiLine=True)

### Explore DataFrame Schema

In [64]:
# Print the schema of the loaded JSON as an indented tree to see which
# columns are nested structs/arrays and need flattening.
rawDF.printSchema()

root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)



# Convert "batters" Nested Structure to Simple DataFrame

### Rename ID to Key

In [65]:
# Rename the top-level "id" to "key" so it does not clash with the nested
# "id" fields that surface once the batter/topping structs are flattened.
sampleDF = rawDF.withColumnRenamed(
    "id",
    "key",
)

### Select "batters" columns

In [66]:
# Keep the record key and pull the nested array out of the "batters" struct;
# the dotted path "batters.batter" yields a top-level column named "batter".
batDF = sampleDF.select(
    col("key"),
    col("batters.batter"),
)
batDF.printSchema()

root
 |-- key: string (nullable = true)
 |-- batter: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)



In [67]:
# Show a single row without truncating cell contents so the whole array is visible.
batDF.show(n=1, truncate=False)

+----+-----------------------------------------------------------------------------+
|key |batter                                                                       |
+----+-----------------------------------------------------------------------------+
|0001|[[1001, Regular], [1002, Chocolate], [1003, Blueberry], [1004, Devil's Food]]|
+----+-----------------------------------------------------------------------------+



### Creating a row for each element : explode

In [68]:
# explode() emits one output row per array element, repeating "key" on each.
# Note: rows whose array is null or empty produce no output rows
# (explode_outer would keep them).
bat2DF = batDF.select(
    col("key"),
    explode(col("batter")).alias("new_batter"),
)
bat2DF.show()

+----+--------------------+
| key|          new_batter|
+----+--------------------+
|0001|     [1001, Regular]|
|0001|   [1002, Chocolate]|
|0001|   [1003, Blueberry]|
|0001|[1004, Devil's Food]|
+----+--------------------+



In [69]:
# Confirm the result of the explode: "new_batter" is now a single struct per
# row instead of an array of structs.
bat2DF.printSchema()

root
 |-- key: string (nullable = true)
 |-- new_batter: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- type: string (nullable = true)



In [70]:
# "new_batter.*" expands every field of the struct into its own top-level column.
(
    bat2DF
    .select("key", "new_batter.*")
    .show()
)

+----+----+------------+
| key|  id|        type|
+----+----+------------+
|0001|1001|     Regular|
|0001|1002|   Chocolate|
|0001|1003|   Blueberry|
|0001|1004|Devil's Food|
+----+----+------------+



### Creating a row for each struct element : explode

In [71]:
# Full batter pipeline in one chain, starting from the renamed source frame.
finalBatDF = (
    sampleDF
    # One row per element of the nested batters.batter array.
    .select(col("key"), explode(col("batters.batter")).alias("new_batter"))
    # Promote the struct's fields to top-level columns.
    .select("key", "new_batter.*")
    # Prefix the generic field names so they stay unambiguous later on.
    .withColumnRenamed("id", "bat_id")
    .withColumnRenamed("type", "bat_type")
)
finalBatDF.show()

+----+------+------------+
| key|bat_id|    bat_type|
+----+------+------------+
|0001|  1001|     Regular|
|0001|  1002|   Chocolate|
|0001|  1003|   Blueberry|
|0001|  1004|Devil's Food|
+----+------+------------+



### Select element from Array : topping

In [72]:
# Pick only the first topping (array index 0) and read its struct fields.
# getField() is used instead of attribute access (.id / .type): attribute
# access silently resolves to a Column method when a field name happens to
# match one, whereas getField() always addresses the struct field.
sampleDF.select(
    col("topping").getItem(0).getField("id").alias("top_id"),
    col("topping").getItem(0).getField("type").alias("top_type"),
).show()

+------+--------+
|top_id|top_type|
+------+--------+
|  5001|    None|
+------+--------+



# Convert "topping" Nested Structure to Simple DataFrame

### Creating a row for each Array element : explode

In [73]:
# Flatten the "topping" array of structs into one row per topping.
topDF = (
    sampleDF
    # One row per element of the topping array.
    .select(col("key"), explode(col("topping")).alias("new_topping"))
    # Promote the struct's fields to top-level columns.
    .select("key", "new_topping.*")
    # Prefix the generic field names so they stay unambiguous later on.
    .withColumnRenamed("id", "top_id")
    .withColumnRenamed("type", "top_type")
)
# Show up to 10 rows without truncating the longer topping names.
topDF.show(n=10, truncate=False)

+----+------+------------------------+
|key |top_id|top_type                |
+----+------+------------------------+
|0001|5001  |None                    |
|0001|5002  |Glazed                  |
|0001|5005  |Sugar                   |
|0001|5007  |Powdered Sugar          |
|0001|5006  |Chocolate with Sprinkles|
|0001|5003  |Chocolate               |
|0001|5004  |Maple                   |
+----+------+------------------------+



## Thank You