In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Sample input: a single customer record with nested contact/address structs
# and a list of order structs (order_id, product, quantity, price).
data = [
    {
        "id": 1,
        "name": "John Doe",
        "contact": {"phone": "123-456-7890", "email": "john.doe@example.com"},
        "address": {
            "street": "123 Main St",
            "city": "Anytown",
            "state": "Anystate",
            "zipcode": "12345",
        },
        # Each tuple below is one order, paired positionally with the field names.
        "orders": [
            dict(zip(("order_id", "product", "quantity", "price"), order_row))
            for order_row in (
                (101, "Laptop", 1, 999.99),
                (102, "Smartphone", 2, 599.99),
            )
        ],
    }
]


In [0]:
# Explicit schema for the sample records, assembled with the chained
# StructType.add() builder. Fields are nullable unless marked otherwise;
# only the top-level `id` and each order's `order_id` are required.
custom_schema = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add("name", StringType())
    .add(
        "contact",
        StructType()
        .add("phone", StringType())
        .add("email", StringType()),
    )
    .add(
        "address",
        StructType()
        .add("street", StringType())
        .add("city", StringType())
        .add("state", StringType())
        .add("zipcode", StringType()),
    )
    .add(
        "orders",
        # Array of order structs, one element per purchased line item.
        ArrayType(
            StructType()
            .add("order_id", IntegerType(), nullable=False)
            .add("product", StringType())
            .add("quantity", IntegerType())
            .add("price", DoubleType())
        ),
    )
)


In [0]:
# Build the nested DataFrame from the in-memory records, applying the explicit
# schema defined above instead of relying on schema inference.
df = spark.createDataFrame(data=data, schema=custom_schema)

In [0]:
# Produce one output row per element of the 'orders' array. The element is
# exposed as a struct column named 'order'; the original 'orders' array column
# is kept alongside it.
# Note: explode() emits no row for a record whose array is null or empty
# (explode_outer() would keep such records with a null 'order').
exploded_df = df.withColumn("order", explode("orders"))


In [0]:
# Flatten the nested structs into top-level columns. The "<struct>.*" selector
# expands every field of the struct under its own (unprefixed) name, so the
# result has columns such as phone, email, street, ..., order_id, product.
# Alternative: select each nested field explicitly with a prefixed alias, e.g.
# col("contact.phone").alias("contact_phone"), when unprefixed field names
# from different structs could collide.
flattened_df = exploded_df.select(
    "id", "name", "contact.*", "address.*", "order.*"
)

In [0]:
# Show the resulting DataFrame: one row per order, with the contact, address
# and order fields as top-level columns.
# NOTE(review): `.display()` is presumably the Databricks notebook helper rather
# than a core PySpark DataFrame method — confirm the target runtime; `.show()`
# is the portable alternative.
flattened_df.display()


id,name,phone,email,street,city,state,zipcode,order_id,product,quantity,price
1,John Doe,123-456-7890,john.doe@example.com,123 Main St,Anytown,Anystate,12345,101,Laptop,1,999.99
1,John Doe,123-456-7890,john.doe@example.com,123 Main St,Anytown,Anystate,12345,102,Smartphone,2,599.99
