In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Base directory of the shows JSON files. The trailing slash matters:
# file names are appended to it by plain string concatenation.
path_shows = "/Volumes/workspace/dataanalysispysparkbook/bronze_files/eda/shows/"

In [0]:
# Load the Silicon Valley show document; no schema is passed, so Spark
# infers one from the JSON content.
shows = spark.read.json(
    path=path_shows + "shows-silicon-valley.json",
)

# Preview at most five rows of the result.
shows.limit(5).display()

In [0]:
# Print the schema of the `schedule` column on its own.
shows.select(F.col("schedule")).printSchema()

Navigating structs as if they were nested columns

In [0]:
# Print the schema of the `_embedded` column on its own.
shows.select("_embedded").printSchema()

In [0]:
# Promote the nested `_embedded.episodes` field to a top-level `episodes`
# column, then discard the now-redundant `_embedded` wrapper.
shows_clean = (
    shows
    .withColumn("episodes", F.col("_embedded.episodes"))
    .drop("_embedded")
)

shows_clean.printSchema()

In [0]:
# Dot notation reaches the `name` field nested under `episodes`;
# the selected column is called `name`.
episodes_name = shows_clean.select("episodes.name")
episodes_name.printSchema()
episodes_name.show(3, truncate=100)

# Explode `name` so that each element gets its own row, and display the
# first three rows without truncating the values.
(
    episodes_name
    .select(F.explode("name").alias("name"))
    .show(3, truncate=False)
)

### Building and using the data frame schema

In [0]:
import pyspark.sql.types as T

In [0]:
# Schema of an episode's `_links` struct: a `self` struct holding an `href` string.
episode_links_schema = T.StructType().add(
    "self", T.StructType().add("href", T.StringType())
)

# Schema of an episode's `image` struct: two string fields.
episode_image_schema = (
    T.StructType()
    .add("medium", T.StringType())
    .add("original", T.StringType())
)

# Schema of a single episode record. `airdate` and `airstamp` are declared
# as date / timestamp types; the nested structs reuse the schemas above.
episode_schema = (
    T.StructType()
    .add("_links", episode_links_schema)
    .add("airdate", T.DateType())
    .add("airstamp", T.TimestampType())
    .add("airtime", T.StringType())
    .add("id", T.StringType())
    .add("image", episode_image_schema)
    .add("name", T.StringType())
    .add("number", T.LongType())
    .add("runtime", T.LongType())
    .add("season", T.LongType())
    .add("summary", T.StringType())
    .add("url", T.StringType())
)

# Partial document schema: only `_embedded.episodes` (an array of episode
# structs) is described; no other top-level field is listed.
embedded_schema = T.StructType().add(
    "_embedded",
    T.StructType().add("episodes", T.ArrayType(episode_schema)),
)

Reading a JSON document using an explicit partial schema

In [0]:
# Read the same document again, this time with the explicit partial schema.
# The file is located through `path_shows`, exactly like the schema-inferred
# read of `shows`, so both data frames come from the same file instead of
# relying on a relative "./data/..." path.
# With mode="FAILFAST" the DataFrameReader will crash if the schema is
# incompatible with the data.
shows_with_schema = spark.read.json(
    path_shows + "shows-silicon-valley.json",
    schema=embedded_schema,
    mode="FAILFAST",
)


Validating that the airdate and airstamp fields were read correctly

In [0]:
# Validate the fields declared as DateType / TimestampType in the explicit
# schema. The check must run against `shows_with_schema`: the schema-inferred
# `shows` frame was not read with that schema, so inspecting it would not
# exercise the date / timestamp parsing at all.
for column in ["airdate", "airstamp"]:
    (
        shows_with_schema
        .select(f"_embedded.episodes.{column}")
        .select(F.explode(column))  # one row per array element
        .show(5)
    )

Pretty-printing the schema

In [0]:
import pprint

# Pretty-print, as a JSON-compatible dict, the schema of a data frame that
# holds only the `airtime` field of each exploded episode.
pprint.pprint(
    shows_with_schema
    .select(F.explode("_embedded.episodes").alias("episode"))
    .select("episode.airtime")
    .schema
    .jsonValue()
)
# Expected output:
# {'fields': [{'metadata': {},
#              'name': 'airtime',
#              'nullable': True,
#              'type': 'string'}],
#  'type': 'struct'}

Pretty-printing dummy complex types

In [0]:
# An array field: the element type is nested under the 'type' key.
pprint.pprint(
    T.StructField("array_example", T.ArrayType(T.StringType())).jsonValue()
)
# {'metadata': {},
#  'name': 'array_example',
#  'nullable': True,
#  'type': {'containsNull': True, 'elementType': 'string', 'type': 'array'}}

# A map field: key and value types are both reported.
pprint.pprint(
    T.StructField(
        "map_example",
        T.MapType(T.StringType(), T.LongType()),
    ).jsonValue()
)
# {'metadata': {},
#  'name': 'map_example',
#  'nullable': True,
#  'type': {'keyType': 'string',
#           'type': 'map',
#           'valueContainsNull': True,
#           'valueType': 'long'}}

# A struct combining both fields: each one appears as an entry of 'fields'.
pprint.pprint(
    T.StructType(
        [
            T.StructField(
                "map_example",
                T.MapType(T.StringType(), T.LongType()),
            ),
            T.StructField("array_example", T.ArrayType(T.StringType())),
        ]
    ).jsonValue()
)
# {'fields': [{'metadata': {},
#              'name': 'map_example',
#              'nullable': True,
#              'type': {'keyType': 'string',
#                       'type': 'map',
#                       'valueContainsNull': True,
#                       'valueType': 'long'}},
#             {'metadata': {},
#              'name': 'array_example',
#              'nullable': True,
#              'type': {'containsNull': True,
#                       'elementType': 'string',
#                       'type': 'array'}}],
#  'type': 'struct'}

Validating that the schema rebuilt from JSON is equal to the data frame schema

In [0]:
import json
# Round-trip the schema: serialize it to a JSON string, parse that string
# back into a dict, and rebuild a StructType from the dict.
other_shows_schema = T.StructType.fromJson(
    json.loads(
        shows_with_schema.schema.json()
    )
)

# The rebuilt schema should compare equal to the original one.
print(other_shows_schema == shows_with_schema.schema)  # True