### Reading XML data with an inferred schema

In [0]:
%pip install spark-xml

In [0]:
# Load the Nobel prizes XML file; every <row> element becomes one record.
# No schema is supplied here, so the reader infers the column types from the data.
nobel_xml_path = "/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/nobel_prizes.xml"
xml_reader = spark.read.format("com.databricks.spark.xml").option("rowTag", "row")
df = xml_reader.load(nobel_xml_path)

In [0]:
# Print the column names and types that were inferred when the XML was read.
df.printSchema()

In [0]:
# Display the first rows of the DataFrame. With no arguments, show() prints
# up to 20 rows and truncates long cell values.
df.show()

# Alternatives (uncomment to try):

# df.show(50)  # Display first 50 rows
# df.show(10, truncate=False)  # Display first 10 rows without truncation

In [0]:
# Project only the top-level `category` and `year` columns and display them.
df.select("category", "year").show()

In [0]:
# `col` is not imported by any earlier cell, so import it here to keep this
# cell runnable on a fresh kernel.
from pyspark.sql.functions import col

# Show category and year together with the id of the first laureate:
# getItem(0) picks the first element of the `laureates` array and `.id`
# reads that element's `id` field.
(df.select("category", "year",
           col("laureates").getItem(0).id).show())

In [0]:
# `col` and `explode` are used below but are not imported by any earlier cell.
from pyspark.sql.functions import col, explode

# Flatten the nested data: explode() yields one row per element of the
# `laureates` array, after which each laureate's struct fields are promoted
# to top-level columns next to the prize-level columns.
# Note: explode() emits no row for a null or empty array, so prizes without
# laureates are absent from the result (explode_outer would keep them).
df_flattened = (
    df
    .withColumn("laureates", explode(col("laureates")))
    .select(
        col("category"),
        col("year"),
        col("overallMotivation"),
        col("laureates.id"),
        col("laureates.firstname"),
        col("laureates.surname"),
        col("laureates.share"),
        col("laureates.motivation"),
    )
)

# truncate=False prints full cell values instead of shortening long strings.
df_flattened.show(truncate=False)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Explicit schema for the XML rows, so the reader does not have to infer it.

# Each laureate is a struct of nullable string fields.
laureate_type = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("firstname", "id", "motivation", "share", "surname")
    ]
)

# Top-level row: prize attributes plus a nullable array of laureate structs.
schema = StructType(
    [
        StructField("category", StringType(), True),
        StructField("laureates", ArrayType(laureate_type, True), True),
        StructField("overallMotivation", StringType(), True),
        StructField("year", IntegerType(), True),
    ]
)

# Re-read the same XML file, this time applying the explicit `schema`
# instead of letting the reader infer one.
nobel_xml_source = "/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/nobel_prizes.xml"
df_with_schema = (
    spark.read
    .format("com.databricks.spark.xml")
    .options(rowTag="row")
    .schema(schema)
    .load(nobel_xml_source)
)

df_with_schema.show()