### Reading JSON data with an inferred schema

In [0]:
# Load the dataset without supplying a schema, so Spark infers it from the file.
# The multiLine option is turned on for the JSON reader.
df = spark.read.option("multiLine", "true").json(
    "/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/Stanford Question Answering Dataset.json"
)

In [0]:
# Print the schema of df to inspect its (nested) column structure.
df.printSchema()

In [0]:
from pyspark.sql.functions import explode, col, array_distinct

In [0]:
df_exploded = (
    df
    # Step 1: one output row per element of the "paragraphs" array.
    .select("title", explode("paragraphs").alias("paragraphs"))
    # Step 2: keep the paragraph context and emit one row per element of "qas".
    .select(
        "title",
        col("paragraphs.context").alias("context"),
        explode("paragraphs.qas").alias("questions"),
    )
)

df_exploded.show()

In [0]:
# Pull the question fields out of the "questions" struct into top-level columns.
df_array_distinct = df_exploded.select(
    "title",
    "context",
    col("questions.id").alias("question_id"),
    col("questions.question").alias("question_text"),
    # Remove duplicate entries from each question's answers array.
    array_distinct(col("questions.answers")).alias("answers"),
)

df_array_distinct.show()

In [0]:
# Show the "text" field of the first element of "answers".
# No alias is given, so the resulting column keeps its auto-generated name.
df_array_distinct.select(
    "title",
    "context",
    "question_text",
    col("answers").getItem(0).getField("text"),
).show()

### Large number of rows with explode 

In [0]:
# Same projection of the first answer's "text", this time exposed as "answer".
df_array_distinct.select(
    "title",
    "context",
    "question_text",
    col("answers")[0]["text"].alias("answer"),
).show()

### Nested data with null values 

In [0]:
# Keep only the rows whose first answer has a non-null "text" field.
df_array_distinct.where(
    col("answers")[0]["text"].isNotNull()
).show()

### `array_contains()`

In [0]:
from pyspark.sql.functions import array_contains

# Three single-column rows, each holding an array of fruit names.
df = spark.createDataFrame(
    [
        (["apple", "orange", "banana"],),
        (["grape", "kiwi", "melon"],),
        (["pear", "apple", "pineapple"],),
    ],
    schema=["fruits"],
)

# Append a boolean column telling whether the array contains "apple".
df.withColumn(
    "contains_apple", array_contains("fruits", "apple")
).show(truncate=False)


### `map_keys()` and `map_values()`

In [0]:
# One record per user; each record wraps the user's details in a "user_info" dict.
data = [
    {"user_info": {"name": name, "age": age, "email": email}}
    for name, age, email in (
        ("Alice", 28, "alice@example.com"),
        ("Bob", 35, "bob@example.com"),
        ("Charlie", 42, "charlie@example.com"),
    )
]

df = spark.createDataFrame(data)
df.show(truncate=False)


In [0]:
from pyspark.sql.functions import map_keys, map_values

In [0]:
# Show the "user_info" column next to the results of map_keys and map_values.
df.select(
    "user_info",
    map_keys("user_info").alias("user_info_keys"),
    map_values("user_info").alias("user_info_values"),
).show(truncate=False)

### `explode_outer()`

In [0]:
from pyspark.sql.functions import explode_outer

In [0]:
# Two rows with word lists and one row whose "words" value is None.
data = [
    {"words": words}
    for words in (["hello", "world"], ["foo", "bar", "baz"], None)
]

df = spark.createDataFrame(data)

# explode_outer still emits a row (holding null) for the row whose array is null.
df.select(explode_outer("words").alias("word")).show(truncate=False)

### `posexplode()`

In [0]:
# Same sample as before: two word lists and one None value.
data = [
    {"words": words}
    for words in (["hello", "world"], ["foo", "bar", "baz"], None)
]

df = spark.createDataFrame(data)

# posexplode is invoked through a SQL expression; it yields the position
# together with the element, named "pos" and "word" here.
(
    df
    .selectExpr("posexplode(words) as (pos, word)")
    .show(truncate=False)
)