In [1]:
import pyspark.sql.functions as fn
from pyspark.sql.utils import AnalysisException

In [2]:
# grab the SparkContext from the session (`spark` is assumed to be supplied by the notebook runtime)
sc = spark.sparkContext

# sample records: a single string holding a JSON array of three nested person objects
people_json = ['[{"name":{"first":"Alice", "last": "Summers"},"age": "34", "cars": ["honda"],  "locations":[{"type": "work", "city":"Glasgow"}, {"type": "home", "city": "Edinburgh"}]},\
                 {"name":{"first":"Bob", "last": "Winters"},"age": "33", "cars": ["ford", "BMW"], "locations":[{"type": "work", "city":"Glasgow"}, {"type": "home", "city": "Edinburgh"}]},\
                 {"name":{"first":"Charlie", "last": "Spring"},"age": "35", "cars": ["vauxhall", "peugeot", "VW"], "locations":[{"type": "work", "city":"Bristol"}, {"type": "home", "city": "Bath"}, {"type":"holiday", "city": "Paris"}]}]']

# distribute the JSON text as an RDD of strings
people_rdd = sc.parallelize(people_json)
# parse the RDD into a DataFrame (no schema is given, so Spark infers one)
# and put the columns in the order used throughout the notebook
people_df = spark.read.json(people_rdd).select("age", "name", "cars", "locations")

display(people_df)

age,name,cars,locations
34,"List(Alice, Summers)",List(honda),"List(List(Glasgow, work), List(Edinburgh, home))"
33,"List(Bob, Winters)","List(ford, BMW)","List(List(Glasgow, work), List(Edinburgh, home))"
35,"List(Charlie, Spring)","List(vauxhall, peugeot, VW)","List(List(Bristol, work), List(Bath, home), List(Paris, holiday))"


In [3]:
# print the schema Spark inferred for people_df (read.json was called without an explicit schema)
people_df.printSchema()

In [4]:
# pull a single field out of a struct column (a nested JSON object) by naming the field
surname_df = people_df.withColumn("surname", fn.col("name").getField("last"))
display(surname_df)

age,name,cars,locations,surname
34,"List(Alice, Summers)",List(honda),"List(List(Glasgow, work), List(Edinburgh, home))",Summers
33,"List(Bob, Winters)","List(ford, BMW)","List(List(Glasgow, work), List(Edinburgh, home))",Winters
35,"List(Charlie, Spring)","List(vauxhall, peugeot, VW)","List(List(Bristol, work), List(Bath, home), List(Paris, holiday))",Spring


In [5]:
# pull a single element out of an array (list) column by its zero-based position
first_car = people_df.withColumn("first_car", fn.col("cars").getItem(0))
display(first_car)

age,name,cars,locations,first_car
34,"List(Alice, Summers)",List(honda),"List(List(Glasgow, work), List(Edinburgh, home))",honda
33,"List(Bob, Winters)","List(ford, BMW)","List(List(Glasgow, work), List(Edinburgh, home))",ford
35,"List(Charlie, Spring)","List(vauxhall, peugeot, VW)","List(List(Bristol, work), List(Bath, home), List(Paris, holiday))",vauxhall


In [6]:
# pick the first element of the array, then one field of that struct,
# giving a single scalar value per row in a new DataFrame
single_value_people_df = people_df.withColumn(
    "first_city",
    fn.col("locations").getItem(0).getField("city"),
)
display(single_value_people_df)

age,name,cars,locations,first_city
34,"List(Alice, Summers)",List(honda),"List(List(Glasgow, work), List(Edinburgh, home))",Glasgow
33,"List(Bob, Winters)","List(ford, BMW)","List(List(Glasgow, work), List(Edinburgh, home))",Glasgow
35,"List(Charlie, Spring)","List(vauxhall, peugeot, VW)","List(List(Bristol, work), List(Bath, home), List(Paris, holiday))",Bristol


In [7]:
# naming a struct field directly on an array-of-structs column collects that
# field from every element, so `cities` is itself an array
expanded_people_df = people_df.withColumn(
    "cities",
    fn.col("locations").getField("city"),
)

In [8]:
# render the result: each row's `cities` lists every city found in its `locations` array
display(expanded_people_df)

age,name,cars,locations,cities
34,"List(Alice, Summers)",List(honda),"List(List(Glasgow, work), List(Edinburgh, home))","List(Glasgow, Edinburgh)"
33,"List(Bob, Winters)","List(ford, BMW)","List(List(Glasgow, work), List(Edinburgh, home))","List(Glasgow, Edinburgh)"
35,"List(Charlie, Spring)","List(vauxhall, peugeot, VW)","List(List(Bristol, work), List(Bath, home), List(Paris, holiday))","List(Bristol, Bath, Paris)"


In [9]:
# indexing the array without naming a field keeps the whole first struct as the new column
extracted_people_df = people_df.withColumn(
    "location",
    fn.col("locations").getItem(0),
)

In [10]:
# render the result: `location` holds the complete first element of each row's `locations` array
display(extracted_people_df)

age,name,cars,locations,location
34,"List(Alice, Summers)",List(honda),"List(List(Glasgow, work), List(Edinburgh, home))","List(Glasgow, work)"
33,"List(Bob, Winters)","List(ford, BMW)","List(List(Glasgow, work), List(Edinburgh, home))","List(Glasgow, work)"
35,"List(Charlie, Spring)","List(vauxhall, peugeot, VW)","List(List(Bristol, work), List(Bath, home), List(Paris, holiday))","List(Bristol, work)"


In [11]:
# A second sample dataset, again supplied as an RDD[String]: here the single string
# holds a JSON array of three subject records, and each record's "performance"
# object carries arrays of names, scores and grades
class_json = ['[{"subject":"maths", "performance":{"names": ["Alice", "Bob", "Charlie", "Daniel", "Emily"], "scores":[45,56,67,34,89], "grades": ["D", "C", "B", "E", "A"]}},\
                 {"subject":"english", "performance":{"names": ["Alice", "Bob", "Charlie", "Daniel", "Emily"], "scores":[79,54,62,39,64], "grades": ["A", "C", "B", "E", "B"]}},\
                 {"subject":"history", "performance":{"names": ["Alice", "Bob", "Charlie", "Daniel", "Emily"], "scores":[47,76,61,44,79], "grades": ["D", "A", "B", "D", "A"]}}]']
class_rdd = sc.parallelize(class_json)
# parse the JSON into a DataFrame and pin the column order in one expression
class_df = spark.read.json(class_rdd).select("subject", "performance")
# plain-text preview of the parsed rows
class_df.show()

In [12]:
# rich tabular rendering of class_df (the previous cell printed it as plain text with show())
display(class_df)

subject,performance
maths,"List(List(D, C, B, E, A), List(Alice, Bob, Charlie, Daniel, Emily), List(45, 56, 67, 34, 89))"
english,"List(List(A, C, B, E, B), List(Alice, Bob, Charlie, Daniel, Emily), List(79, 54, 62, 39, 64))"
history,"List(List(D, A, B, D, A), List(Alice, Bob, Charlie, Daniel, Emily), List(47, 76, 61, 44, 79))"


In [13]:
# we can pull out a specified value from a struct of arrays to a new column:
# name the struct field first ("grades"), then index into that array.
# The column is called "first_student_grade" (the grade of the first listed
# student), matching the header shown in this notebook's recorded output.
grade_df = class_df.withColumn("first_student_grade", fn.col("performance")["grades"][0])

In [14]:
# render the result: the added column holds the first entry of each subject's grades array
display(grade_df)

subject,performance,first_student_grade
maths,"List(List(D, C, B, E, A), List(Alice, Bob, Charlie, Daniel, Emily), List(45, 56, 67, 34, 89))",D
english,"List(List(A, C, B, E, B), List(Alice, Bob, Charlie, Daniel, Emily), List(79, 54, 62, 39, 64))",A
history,"List(List(D, A, B, D, A), List(Alice, Bob, Charlie, Daniel, Emily), List(47, 76, 61, 44, 79))",D


In [15]:
# however we cannot directly extract all first elements from a struct of arrays:
# `performance` is a struct, and a struct can only be addressed by field name,
# so asking for position 0 of the struct itself is rejected by Spark.
# The error is the point of this cell, so it is caught and printed rather than
# left to abort a "Restart & Run All"; extracted_class_df is never assigned.
try:
    extracted_class_df = class_df.withColumn("first_student", fn.col("performance")[0])
except AnalysisException as err:
    # this gives an ERROR!
    print("AnalysisException:", err)

In [16]:
# "performance.*" expands the struct so that each of its fields becomes its own
# top-level column, ready for onward processing
separated_class_df = class_df.select(
    fn.col("subject"),
    fn.col("performance.*"),
)

display(separated_class_df)

subject,grades,names,scores
maths,"List(D, C, B, E, A)","List(Alice, Bob, Charlie, Daniel, Emily)","List(45, 56, 67, 34, 89)"
english,"List(A, C, B, E, B)","List(Alice, Bob, Charlie, Daniel, Emily)","List(79, 54, 62, 39, 64)"
history,"List(D, A, B, D, A)","List(Alice, Bob, Charlie, Daniel, Emily)","List(47, 76, 61, 44, 79)"
