In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Folder that holds the shows JSON files read by the cells below.
path_shows = "/Volumes/workspace/dataanalysispysparkbook/bronze_files/eda/shows/"

In [0]:
# Read the single "shows-silicon-valley.json" file and preview up to five rows.
shows = spark.read.json(f"{path_shows}shows-silicon-valley.json")
shows.limit(5).display()

Ahora vamos a cargar los tres archivos JSON que están en la misma ruta "shows" y que comparten el mismo formato:

In [0]:
# Read every JSON file found under path_shows; multiLine=True allows a single
# record to span several lines of a file.
three_shows = spark.read.json(path_shows, multiLine=True)
three_shows.limit(5).display()

# Count once and reuse the value: each count() launches a Spark job, and a bare
# count() in the middle of a cell is never displayed anyway.
n_shows = three_shows.count()
assert n_shows == 3, f"expected 3 shows, found {n_shows}"
print(three_shows.columns)

### The Array
When you have more than one value in a single column

In [0]:
# Keep only the show name and its `genres` column, then preview up to five rows.
array_subset = shows.select(F.col("name"), F.col("genres"))
array_subset.limit(5).display()

In [0]:
# Four equivalent ways to read the first element of the `genres` array.
# The selection starts again from `shows` instead of overwriting its own input,
# so this cell can be re-run without failing on a missing `genres` column.
genres_source = shows.select("name", "genres")
array_subset = genres_source.select(
    "name",
    genres_source.genres[0].alias("dot_and_index"),  # attribute access + [index]
    F.col("genres")[0].alias("col_and_index"),  # F.col + [index]
    genres_source.genres.getItem(0).alias("dot_and_method"),  # attribute access + getItem
    F.col("genres").getItem(0).alias("col_and_method"),  # F.col + getItem
)
array_subset.show()

In [0]:
# First pass: three literal genre columns next to the first genre of each show.
genre_literals = array_subset.select(
    F.col("name"),
    F.lit("Comedy").alias("one"),
    F.lit("Horror").alias("two"),
    F.lit("Drama").alias("three"),
    "dot_and_index",
)

# Second pass: pack the literals into one array and repeat the first genre
# five times in another.
array_subset_repeated = genre_literals.select(
    F.col("name"),
    F.array(F.col("one"), F.col("two"), F.col("three")).alias("Some_Genres"),
    F.array_repeat(F.col("dot_and_index"), 5).alias("Repeated_Genres"),
)
array_subset_repeated.show(n=1, truncate=False)

In [0]:
# Number of elements held by each of the two array columns.
array_subset_repeated.select(
    F.col("name"),
    *[F.size(array_col) for array_col in ("Some_Genres", "Repeated_Genres")],
).show()

In [0]:
# Drop duplicate entries inside each array column.
array_subset_repeated.select(
    F.col("name"),
    F.array_distinct(F.col("Some_Genres")),
    F.array_distinct(F.col("Repeated_Genres")),
).show(n=1, truncate=False)

In [0]:
# array_intersect() keeps the elements that appear in both arrays.
array_subset_repeated.select(
    F.col("name"),
    F.array_intersect(F.col("Some_Genres"), F.col("Repeated_Genres")).alias("Genres"),
).show(n=1, truncate=False)

Using array_position() to find the position of a genre string inside the array

In [0]:
# Look up where "Comedy" sits inside Some_Genres; array_position() reports a
# 1-based position and 0 when the value is absent.
array_subset_repeated.select(
    F.col("Some_Genres"),
    F.array_position(F.col("Some_Genres"), "Comedy"),
).show(n=1, truncate=False)

Creating a map from two arrays

In [0]:
columns = ["name", "language", "type"]

# First select: one literal column per key name (each literal column is named
# after its own value), plus an array gathering the matching values from `shows`.
shows_map = shows.select(
    *map(F.lit, columns),
    F.array(*columns).alias("values"),
)

# Second select: `columns` now resolves to the literal columns created above,
# so this array holds the key names themselves.
shows_map = shows_map.select(F.array(*columns).alias("keys"), F.col("values"))
shows_map.show(n=1, truncate=False)

# explode() emits one output row per element of the `values` array.
exploded = shows_map.select(F.explode(F.col("values")).alias("value"))
exploded.show(n=1, truncate=False)

In [0]:
# Pair the `keys` and `values` arrays element by element into one map column.
# NOTE: shows_map is overwritten and keeps only `mapped`, so running this cell
# a second time fails unless the cell that builds `keys`/`values` is re-run first.
shows_map = shows_map.select(
    F.map_from_arrays(F.col("keys"), F.col("values")).alias("mapped"),
)
shows_map.printSchema()
shows_map.show(n=1, truncate=False)

In [0]:
# Three ways to read the value stored under the "name" key of the `mapped` column.
shows_map.select(
    F.col("mapped.name"),  # dotted path inside the column name
    F.col("mapped")["name"],  # bracket lookup on a Column built with F.col
    shows_map.mapped["name"],  # bracket lookup on the DataFrame attribute
).show()